chore: kernel source patches, local recipe updates, and build artifacts

Kernel source (ephemeral checkout — the durable copies of these changes live in local/patches/kernel/):
- P20 x2apic ICR mode fix, P21 x2apic SMP fix applied
- ACPI MADT, RSDP, SDT improvements
- Context switch, percpu, event, IRQ scheme updates
- MSI/vector allocation, NUMA/SLIT/SRAT support

Local recipe source updates:
- redox-driver-acpi: bus/prt hardening
- redox-drm: Intel display, KMS connector improvements
- driver-manager: config/scheme hardening
- thermald: main.rs fix
- uutils-tar, ninja-build: source updates

Other:
- bootloader, installer, redoxfs, relibc, userutils source updates
- recipe.toml.backup, libxcvt source directory
This commit is contained in:
2026-05-18 14:20:54 +03:00
parent 29ff1ea8fc
commit 0cbad35638
72 changed files with 5058 additions and 599 deletions
+260
View File
@@ -0,0 +1,260 @@
[source]
git = "https://gitlab.redox-os.org/redox-os/base.git"
rev = "463f76b9608a896e6f6c9f63457f57f6409873c7"
patches = [
"P0-daemon-fix-init-notify-unwrap.patch",
"P0-workspace-add-bootstrap.patch",
"P0-init-continuous-scheduling.patch",
"P0-dhcpd-auto-iface.patch",
"P0-procmgr-sigchld-debug.patch",
"P0-pcid-mcfg-diagnostics.patch",
"P0-ihdgd-intel-gpu-ids.patch",
"P0-acpid-dmar-fix.patch",
# P1: acpid EC runtime and AML physmem hardening (narrow ACPI runtime patches)
"P1-acpid-ec-runtime.patch",
"P1-acpid-runtime-hardening.patch",
# Stale patches needing recreation: P1-pcid-uevent-surface, P2-boot-runtime-fixes,
# P2-hwd-misc, P2-pcid-cfg-access, P3-xhci-device-hardening, P6-cpufreqd-real-impl
"P2-i2c-gpio-ucsi-drivers.patch",
"P0-i2c-control-response-empty.patch",
"P2-ihdad-graceful-init.patch",
"P2-boot-logging.patch",
"P2-init-acpid-wiring.patch",
"P2-hwd-remove-acpid-spawn.patch",
"P2-initfs-pcid-service.patch",
"P2-misc-daemon-fixes.patch",
"P9-fix-so-pecred.patch",
"P3-inputd-keymap-bridge.patch",
# P3: ps2d consolidated — LED feedback, mouse resend, fastfail, Intellimouse2, controller init robustness, non-x86 fallback
"P7-ps2d-intellimouse2-leds-controller-init.patch",
"P3-usbhidd-hardening.patch",
"P3-init-colored-output.patch",
"P4-logd-persistent-logging.patch",
"P4-acpi-shutdown-hardening.patch",
"P4-acpi-s3-sleep.patch",
"P4-pcid-public-client-channel.patch",
"P4-pcid-config-scheme.patch",
"P4-pcid-spawner-pci-coordinate-env.patch",
"P4-initfs-usb-drm-services.patch",
"P4-initfs-release-virtio-gpu.patch",
"P4-initfs-network-services.patch",
"P4-initfs-getty-services.patch",
"P4-initfs-dbus-services.patch",
"P4-fbcond-scrollback.patch",
# P4: ucsid graceful ESTALE/ENOENT handling — don't crash when /scheme/acpi/symbols unavailable
"P4-ucsid-estale-graceful.patch",
# P4: Extend ESTALE/ENOENT graceful handling to all ACPI-reading daemons
"P4-acpi-estale-graceful.patch",
# P4: hwd graceful ESTALE/ENOENT handling in probe()
"P4-hwd-estale-graceful.patch",
# P5: i2c-hidd + intel-thc-hidd: boot-time ESTALE/ENOENT retry with exponential backoff
"P5-i2c-hidd-estale-retry.patch",
# P5: acpid /scheme/acpi/dmi SMBIOS endpoint for quirk matching
"P5-acpid-dmi-endpoint.patch",
"P4-thermal-daemon.patch",
"P4-thermald-workspace.patch",
"P6-driver-main-fixes.patch",
"P6-driver-new-modules.patch",
"P9-init-scheduler-completed.patch",
"P2-pcid-acpid-graceful-fd.patch",
# P5: Graceful DRM ioctl error handling in fbbootlogd/fbcond (avoid ENOTTY crash)
"P5-fbbootlogd-fbcond-graceful-drm.patch",
# P6: Fix rtcd EEXIST by avoiding O_CREAT on kernel scheme resource
"P6-rtcd-no-ocreat.patch",
# P6: Init hard requires dependency — blocks startup if dependency missing
"P6-init-requires-hard-dep.patch",
# P6: Fix pcid→acpid FD transfer — pass FD in metadata array, not payload
"P6-pcid-acpid-fd-transfer.patch",
# P7: Fix acpid pci_fd startup race — shared RwLock between scheme and AML handler
"P7-acpid-shared-pcifd.patch",
# P15: Init service timeout — prevent boot hanging on unresponsive daemons (30s default)
"P15-7-init-service-timeout.patch",
# P15: Dependency cycle detection in unit loader — log and skip circular requires_weak
"P15-8-init-cycle-detection.patch",
# P18: Init daemon restart policy — supervise Notify/Scheme services with exponential backoff
"P18-1-daemon-restart.patch",
# P18: ACPID robustness — RSDP BIOS-area fallback, graceful physmem error handling
"P18-5-acpid-robustness.patch",
# P18: MSI/MSI-X enablement — skip legacy IRQ for MSI-capable devices
"P18-3-msi-msix-enablement.patch",
# P18: Bounded IPC queues — backlog limits for chan, UDS stream, UDS dgram
"P18-8-bounded-ipcd-queues.patch",
# P18: MSI/MSI-X allocation resilience — handle EEXIST, fallback chain MSI-X→MSI→legacy
"P18-9-msi-allocation-resilience.patch",
]
[package]
installs = [
"/lib/pcid.d/ac97d.toml",
"/lib/pcid.d/e1000d.toml",
"/lib/pcid.d/ihdad.toml",
"/lib/pcid.d/ihdgd.toml",
"/lib/pcid.d/ixgbed.toml",
"/lib/pcid.d/rtl8139d.toml",
"/lib/pcid.d/rtl8168d.toml",
"/lib/pcid.d/vboxd.toml",
"/lib/pcid.d/virtio-netd.toml",
"/lib/pcid.d/xhcid.toml",
"/usr/bin/audiod",
"/usr/bin/dhcpd",
"/usr/bin/dw-acpi-i2cd",
"/usr/bin/gpiod",
"/usr/bin/i2cd",
"/usr/bin/i2c-gpio-expanderd",
"/usr/bin/i2c-hidd",
"/usr/bin/inputd",
"/usr/bin/intel-gpiod",
"/usr/bin/ipcd",
"/usr/bin/netstack",
"/usr/bin/pcid",
"/usr/bin/pcid-spawner",
"/usr/bin/ptyd",
"/usr/bin/redoxerd",
"/usr/bin/smolnetd",
"/usr/bin/ucsid",
"/usr/lib/drivers/ac97d",
"/usr/lib/drivers/ahcid",
"/usr/lib/drivers/amd-mp2-i2cd",
"/usr/lib/drivers/e1000d",
"/usr/lib/drivers/ihdad",
"/usr/lib/drivers/ihdgd",
"/usr/lib/drivers/ided",
"/usr/lib/drivers/intel-lpss-i2cd",
"/usr/lib/drivers/intel-thc-hidd",
"/usr/lib/drivers/ixgbed",
"/usr/lib/drivers/ps2d",
"/usr/lib/drivers/rtl8139d",
"/usr/lib/drivers/rtl8168d",
"/usr/lib/drivers/sb16d",
"/usr/lib/drivers/thermald",
"/usr/lib/drivers/usbctl",
"/usr/lib/drivers/usbhidd",
"/usr/lib/drivers/usbhubd",
"/usr/lib/drivers/usbscsid",
"/usr/lib/drivers/vboxd",
"/usr/lib/drivers/virtio-gpud",
"/usr/lib/drivers/virtio-netd",
"/usr/lib/drivers/xhcid",
"/usr/lib/init.d/00_base.target",
"/usr/lib/init.d/00_ipcd.service",
"/usr/lib/init.d/00_pcid-spawner.service",
"/usr/lib/init.d/00_ptyd.service",
"/usr/lib/init.d/00_sudo.service",
"/usr/lib/init.d/00_tmp",
"/usr/lib/init.d/05_boot_essential.target",
"/usr/lib/init.d/10_dhcpd.service",
"/usr/lib/init.d/10_net.target",
"/usr/lib/init.d/10_smolnetd.service",
"/usr/lib/init.d/12_boot_late.target",
"/usr/lib/init.d/12_dbus.service",
"/usr/lib/init.d/13_seatd.service",
"/usr/lib/init.d/13_sessiond.service",
"/usr/lib/init.d/20_audiod.service",
"/usr/lib/init.d/29_activate_console.service",
"/usr/lib/init.d/30_console.service",
"/usr/lib/init.d/30_thermald.service",
"/usr/lib/init.d/31_debug_console.service",
]
[build]
template = "custom"
script = """
# Create the staging directory that receives the standalone daemons built below.
mkdir -pv "${COOKBOOK_STAGE}/usr/bin"
# Build each of these daemons from its own Cargo manifest and stage the
# resulting binary under /usr/bin with the same name.
for package in audiod ipcd ptyd dhcpd; do
"${COOKBOOK_CARGO}" build \
--manifest-path "${COOKBOOK_SOURCE}/${package}/Cargo.toml" \
--target "${TARGET}" \
${build_flags}
cp -v \
"target/${TARGET}/${build_type}/${package}" \
"${COOKBOOK_STAGE}/usr/bin/${package}"
done
# netstack is built once and then staged under two names: netstack and smolnetd.
"${COOKBOOK_CARGO}" build \
--manifest-path "${COOKBOOK_SOURCE}/netstack/Cargo.toml" \
--target "${TARGET}" \
${build_flags}
cp -v \
"target/${TARGET}/${build_type}/netstack" \
"${COOKBOOK_STAGE}/usr/bin/netstack"
cp -v \
"target/${TARGET}/${build_type}/netstack" \
"${COOKBOOK_STAGE}/usr/bin/smolnetd"
# Drivers that are built on all architectures, and NOT in drivers-initfs
# BINS holds Cargo package names; each one is later passed to cargo as a -p argument.
BINS=(
gpiod
i2c-gpio-expanderd
intel-gpiod
amd-mp2-i2cd
dw-acpi-i2cd
e1000d
ihdad
ihdgd
i2c-hidd
intel-thc-hidd
intel-lpss-i2cd
ixgbed
pcid
pcid-spawner
rtl8139d
rtl8168d
usbctl
usbhidd
thermald
usbhubd
ucsid
usbscsid
virtio-gpud
virtio-netd
xhcid
i2cd
inputd
redoxerd
)
# Add additional drivers to the list to build, that are not in drivers-initfs
# depending on the target architecture
# Only the three x86 Redox targets get the extra drivers; every other target
# falls through the default arm and keeps the list unchanged.
case "${TARGET}" in
i586-unknown-redox | i686-unknown-redox | x86_64-unknown-redox)
BINS+=(ac97d ahcid ided nvmed ps2d sb16d vboxd)
;;
*)
;;
esac
#Build each driver in the list
mkdir -pv "${COOKBOOK_STAGE}/usr/bin" "${COOKBOOK_STAGE}/usr/lib/drivers"
# Cargo profile overrides for the workspace build below: release opt-level s and panic=abort.
export CARGO_PROFILE_RELEASE_OPT_LEVEL=s
export CARGO_PROFILE_RELEASE_PANIC=abort
# Only build drivers that actually have source Cargo.toml entries
EXISTING_BINS=()
for bin in "${BINS[@]}"
do
# Recursive, quiet grep over the whole source tree for a line that consists
# solely of the package name assignment for this driver.
if grep -Rqs "^name = \\\"${bin}\\\"$" "${COOKBOOK_SOURCE}"; then
EXISTING_BINS+=("${bin}")
fi
done
# Single workspace build; the command substitution expands to one -p argument
# pair per surviving driver.
"${COOKBOOK_CARGO}" build ${build_flags} \
--manifest-path "${COOKBOOK_SOURCE}/Cargo.toml" \
--target "${TARGET}" \
$(for bin in "${EXISTING_BINS[@]}"; do echo "-p" "${bin}"; done)
# Stage every built binary: the daemons named in the first arm go to /usr/bin,
# everything else is installed as a driver under /usr/lib/drivers.
for bin in "${EXISTING_BINS[@]}"
do
    case "${bin}" in
        gpiod | i2c-gpio-expanderd | intel-gpiod | i2cd | dw-acpi-i2cd | i2c-hidd | inputd | pcid | pcid-spawner | redoxerd | ucsid)
            cp -v "target/${TARGET}/${build_type}/${bin}" "${COOKBOOK_STAGE}/usr/bin"
            ;;
        *)
            cp -v "target/${TARGET}/${build_type}/${bin}" "${COOKBOOK_STAGE}/usr/lib/drivers"
            ;;
    esac
done
# Install each driver's pcid match configuration as /lib/pcid.d/<driver>.toml,
# where <driver> is the name of the directory holding its config.toml.
mkdir -pv "${COOKBOOK_STAGE}/lib/pcid.d"
# IFS= together with read -r keeps each path exactly as find printed it
# (no trimming of surrounding whitespace, no escape processing).
find "${COOKBOOK_SOURCE}/drivers" -maxdepth 3 -type f -name 'config.toml' | while IFS= read -r conf
do
    driver="$(basename "$(dirname "$conf")")"
    cp -v "$conf" "${COOKBOOK_STAGE}/lib/pcid.d/$driver.toml"
done
# Install the init unit files shipped in the source tree's init.d directory.
mkdir -pv "${COOKBOOK_STAGE}/usr/lib/init.d"
cp -v "${COOKBOOK_SOURCE}/init.d"/* "${COOKBOOK_STAGE}/usr/lib/init.d/"
"""
+1
View File
@@ -12,6 +12,7 @@ cc = "1.0"
toml = "0.8"
[dependencies]
acpi_ext = { package = "acpi", git = "https://gitlab.redox-os.org/redox-os/acpi.git", branch = "redox-6.x" }
arrayvec = { version = "0.7.4", default-features = false }
bitfield = "0.13.2"
bitflags = "2"
+1
View File
@@ -1,3 +1,4 @@
# Red Bear OS kernel patches applied via individual patch files
.PHONY: all check
SOURCE:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
+13
View File
@@ -77,6 +77,7 @@ fn main() {
}
"x86_64" => {
println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm");
println!("cargo::rerun-if-changed=src/asm/x86_64/s3_wakeup.asm");
let status = Command::new("nasm")
.arg("-f")
@@ -89,6 +90,18 @@ fn main() {
if !status.success() {
panic!("nasm failed with exit status {}", status);
}
let status = Command::new("nasm")
.arg("-f")
.arg("bin")
.arg("-o")
.arg(format!("{}/s3_wakeup", out_dir))
.arg("src/asm/x86_64/s3_wakeup.asm")
.status()
.expect("failed to run nasm");
if !status.success() {
panic!("nasm failed with exit status {}", status);
}
}
"riscv64" => {
println!("cargo::rustc-cfg=dtb");
@@ -189,8 +189,18 @@ pub(super) fn init(madt: Madt) {
let preliminary_cpu_count = madt
.iter()
.filter(|entry| match entry {
MadtEntry::LocalApic(local) => u32::from(local.id) == me.get() || local.flags & 1 == 1,
MadtEntry::LocalX2Apic(local) => local.x2apic_id == me.get() || local.flags & 1 == 1,
// When x2APIC is active, LocalApic entries use 8-bit IDs that don't
// match the BSP's 32-bit x2APIC ID. Use LocalX2Apic entries instead.
MadtEntry::LocalApic(local) if !local_apic.x2 => {
u32::from(local.id) == me.get() || local.flags & 1 == 1
}
MadtEntry::LocalApic(_) => false,
// xAPIC mode: cannot use 32-bit x2APIC IDs via 8-bit ICR.
// Skip LocalX2Apic entries and use LocalApic exclusively.
MadtEntry::LocalX2Apic(local) if local_apic.x2 => {
local.x2apic_id == me.get() || local.flags & 1 == 1
}
MadtEntry::LocalX2Apic(_) => false,
_ => false,
})
.count();
@@ -205,18 +215,28 @@ pub(super) fn init(madt: Madt) {
let _ = seen_apic_ids.insert(me.get()); // BSP
for entry in madt.iter() {
match entry {
MadtEntry::LocalApic(local) if local.flags & 1 == 1 => {
MadtEntry::LocalApic(local) if local.flags & 1 == 1 && !local_apic.x2 => {
let id = u32::from(local.id);
if !seen_apic_ids.insert(id) {
warn!("MADT: duplicate APIC ID {} in LocalApic entry, firmware bug", id);
}
}
MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 => {
MadtEntry::LocalApic(local) if local.flags & 1 == 1 && local_apic.x2 => {
// x2APIC mode: skip 8-bit LocalApic IDs; they conflict with
// 32-bit x2APIC IDs. Dedup only among LocalX2Apic entries.
debug!("MADT: ignoring 8-bit LocalApic ID {} in x2APIC mode", local.id);
}
MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 && local_apic.x2 => {
let id = local.x2apic_id;
if !seen_apic_ids.insert(id) {
warn!("MADT: duplicate x2APIC ID {} in LocalX2Apic entry, firmware bug", id);
}
}
MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 && !local_apic.x2 => {
// xAPIC mode: skip 32-bit x2APIC IDs; dedup only among LocalApic entries.
let id = local.x2apic_id; // Copy from packed struct
debug!("MADT: ignoring 32-bit x2APIC ID {} in xAPIC mode", id);
}
_ => {}
}
}
@@ -225,7 +245,16 @@ pub(super) fn init(madt: Madt) {
for madt_entry in madt.iter() {
debug!(" {:x?}", madt_entry);
if let MadtEntry::LocalApic(ap_local_apic) = madt_entry {
if u32::from(ap_local_apic.id) == me.get() {
// x2APIC mode: LocalApic entries have 8-bit IDs that don't match
// the BSP's 32-bit x2APIC ID. All entries would be treated as APs,
// and SIPI would target the wrong processors. Skip them and rely
// on LocalX2Apic entries exclusively.
if local_apic.x2 {
debug!(
" Skipping 8-bit LocalApic id={} (x2APIC active, using LocalX2Apic entries)",
ap_local_apic.id
);
} else if u32::from(ap_local_apic.id) == me.get() {
debug!(" This is my local APIC");
} else if ap_local_apic.flags & 1 == 1 {
// Allocate a stack
@@ -383,14 +412,19 @@ pub(super) fn init(madt: Madt) {
}
RmmA::invalidate_all();
} else {
debug!("KERNEL AP: LAPIC CPU {} disabled in MADT, skipping", u32::from(ap_local_apic.id));
}
} else if let MadtEntry::LocalX2Apic(ap_x2apic) = madt_entry {
let apic_id = ap_x2apic.x2apic_id;
let flags = ap_x2apic.flags;
if apic_id == me.get() {
// xAPIC mode: cannot target 32-bit x2APIC IDs via 8-bit ICR.
// Skip LocalX2Apic entries; use LocalApic entries exclusively.
if !local_apic.x2 {
debug!(
" Skipping 32-bit x2APIC id={} (xAPIC mode, using LocalApic entries)",
apic_id
);
} else if apic_id == me.get() {
debug!(" This is my local x2APIC");
} else if flags & 1 == 1 {
let alloc = match allocate_p2frame(4) {
@@ -446,11 +480,7 @@ pub(super) fn init(madt: Madt) {
// Send INIT IPI (Assert)
{
let mut icr = 0x4500u64;
if local_apic.x2 {
icr |= u64::from(apic_id) << 32;
} else {
icr |= u64::from(apic_id as u8) << 56;
}
icr |= u64::from(apic_id) << 32;
local_apic.set_icr(icr);
}
@@ -461,11 +491,7 @@ pub(super) fn init(madt: Madt) {
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x0600u64 | ap_segment as u64;
if local_apic.x2 {
icr |= u64::from(apic_id) << 32;
} else {
icr |= u64::from(apic_id as u8) << 56;
}
icr |= u64::from(apic_id) << 32;
local_apic.set_icr(icr);
}
@@ -476,11 +502,7 @@ pub(super) fn init(madt: Madt) {
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x0600u64 | ap_segment as u64;
if local_apic.x2 {
icr |= u64::from(apic_id) << 32;
} else {
icr |= u64::from(apic_id as u8) << 56;
}
icr |= u64::from(apic_id) << 32;
local_apic.set_icr(icr);
}
@@ -534,8 +556,6 @@ pub(super) fn init(madt: Madt) {
}
RmmA::invalidate_all();
} else {
debug!("KERNEL AP: x2APIC CPU {} disabled in MADT (flags={:#x}), skipping", apic_id, flags);
}
} else if let MadtEntry::LocalApicNmi(nmi) = madt_entry {
let target_apic = nmi.processor;
@@ -34,6 +34,12 @@ impl Madt {
let madt = Madt::new(find_one_sdt!("APIC"));
if let Some(madt) = madt {
// Validate MADT checksum per ACPI 6.5 §5.2.2
if !madt.sdt.validate_checksum() {
error!("MADT checksum validation failed, skipping APIC initialization");
return;
}
// safe because no APs have been started yet.
unsafe { MADT.get().write(Some(madt)) };
@@ -146,6 +152,48 @@ pub struct MadtGicd {
_reserved2: [u8; 3],
}
/// MADT Local x2APIC (entry type 0x9)
///
/// Describes the bytes that follow the 2-byte type/length entry header.
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLocalX2Apic {
    _reserved: u16,
    /// 32-bit x2APIC ID of the processor.
    pub x2apic_id: u32,
    /// Local APIC flags; bit 0 is tested by callers as "enabled".
    pub flags: u32,
    /// ACPI processor UID.
    pub processor_uid: u32,
}

/// MADT Local APIC NMI (entry type 0x4)
///
/// Describes the bytes that follow the 2-byte type/length entry header.
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLocalApicNmi {
    /// ACPI processor UID the NMI applies to.
    pub processor: u8,
    /// MPS INTI flags (polarity / trigger mode).
    pub flags: u16,
    /// Local APIC LINT# input the NMI is wired to.
    pub nmi_pin: u8,
}

/// MADT Local APIC address override (entry type 0x5)
///
/// Describes the bytes that follow the 2-byte type/length entry header.
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLapicAddressOverride {
    _reserved: u16,
    /// 64-bit physical address of the local APIC.
    pub local_apic_address: u64,
}

/// MADT Local x2APIC NMI (entry type 0xA)
///
/// Describes the bytes that follow the 2-byte type/length entry header.
/// Field order follows the ACPI layout: the INTI flags come directly after
/// the header, then the processor UID, then the LINT# and three reserved
/// bytes. (The previous definition placed a reserved word first, which made
/// `flags` and `nmi_pin` read the wrong bytes.)
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLocalX2ApicNmi {
    /// MPS INTI flags (polarity / trigger mode).
    pub flags: u16,
    /// ACPI processor UID the NMI applies to.
    pub processor_uid: u32,
    /// Local x2APIC LINT# input the NMI is wired to.
    pub nmi_pin: u8,
    _reserved: [u8; 3],
}

// Each payload must be exactly the ACPI entry length minus the 2-byte header,
// because the iterator accepts an entry only when `entry_len == size + 2`.
const _: () = assert!(size_of::<MadtLocalX2Apic>() == 14);
const _: () = assert!(size_of::<MadtLocalApicNmi>() == 4);
const _: () = assert!(size_of::<MadtLapicAddressOverride>() == 10);
const _: () = assert!(size_of::<MadtLocalX2ApicNmi>() == 10);
/// MADT Entries
#[derive(Debug)]
#[allow(dead_code)]
@@ -156,10 +204,18 @@ pub enum MadtEntry {
InvalidIoApic(usize),
IntSrcOverride(&'static MadtIntSrcOverride),
InvalidIntSrcOverride(usize),
LocalApicNmi(&'static MadtLocalApicNmi),
InvalidLocalApicNmi(usize),
LapicAddressOverride(&'static MadtLapicAddressOverride),
InvalidLapicAddressOverride(usize),
Gicc(&'static MadtGicc),
InvalidGicc(usize),
Gicd(&'static MadtGicd),
InvalidGicd(usize),
LocalX2Apic(&'static MadtLocalX2Apic),
InvalidLocalX2Apic(usize),
LocalX2ApicNmi(&'static MadtLocalX2ApicNmi),
InvalidLocalX2ApicNmi(usize),
Unknown(u8),
}
@@ -176,6 +232,10 @@ impl Iterator for MadtIter {
let entry_len =
unsafe { *(self.sdt.data_address() as *const u8).add(self.i + 1) } as usize;
if entry_len < 2 {
return None;
}
if self.i + entry_len <= self.sdt.data_len() {
let item = match entry_type {
0x0 => {
@@ -206,6 +266,46 @@ impl Iterator for MadtIter {
MadtEntry::InvalidIntSrcOverride(entry_len)
}
}
0x4 => {
if entry_len == size_of::<MadtLocalApicNmi>() + 2 {
MadtEntry::LocalApicNmi(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLocalApicNmi)
})
} else {
MadtEntry::InvalidLocalApicNmi(entry_len)
}
}
0x5 => {
if entry_len == size_of::<MadtLapicAddressOverride>() + 2 {
MadtEntry::LapicAddressOverride(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLapicAddressOverride)
})
} else {
MadtEntry::InvalidLapicAddressOverride(entry_len)
}
}
0x9 => {
if entry_len == size_of::<MadtLocalX2Apic>() + 2 {
MadtEntry::LocalX2Apic(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLocalX2Apic)
})
} else {
MadtEntry::InvalidLocalX2Apic(entry_len)
}
}
0xA => {
if entry_len == size_of::<MadtLocalX2ApicNmi>() + 2 {
MadtEntry::LocalX2ApicNmi(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLocalX2ApicNmi)
})
} else {
MadtEntry::InvalidLocalX2ApicNmi(entry_len)
}
}
0xB => {
if entry_len >= size_of::<MadtGicc>() + 2 {
MadtEntry::Gicc(unsafe {
@@ -20,6 +20,8 @@ mod rxsdt;
pub mod sdt;
#[cfg(target_arch = "aarch64")]
mod spcr;
pub mod slit;
pub mod srat;
mod xsdt;
unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) {
@@ -82,6 +84,14 @@ impl Rxsdt for RxsdtEnum {
pub static RXSDT_ENUM: Once<RxsdtEnum> = Once::new();
/// Identity of the ACPI root table, captured from the RSDP during `init`.
#[derive(Clone, Copy, Debug)]
pub struct AcpiRootInfo {
// Value returned by `Rsdp::revision()` for the RSDP that was accepted.
pub revision: u8,
// Value returned by `Rsdp::sdt_address()`: physical address of the root SDT.
pub root_sdt_address: PhysicalAddress,
}
/// Set once by `init`; a later `init` with a different RSDP only logs an error
/// and leaves the stored value untouched.
pub static ACPI_ROOT_INFO: Once<AcpiRootInfo> = Once::new();
/// Parse the ACPI tables to gather CPU, interrupt, and timer information
pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
unsafe {
@@ -94,6 +104,15 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp);
if let Some(rsdp) = rsdp_opt {
let root_info = ACPI_ROOT_INFO.call_once(|| AcpiRootInfo {
revision: rsdp.revision(),
root_sdt_address: rsdp.sdt_address(),
});
if root_info.root_sdt_address != rsdp.sdt_address() || root_info.revision != rsdp.revision() {
error!("ACPI_ROOT_INFO already initialized with a different RSDP root");
}
debug!("SDT address: {:#x}", rsdp.sdt_address().data());
let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw());
@@ -146,7 +165,14 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
// TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface
// to initialize enumerated processors to userspace?
// Parse SRAT BEFORE MADT so NUMA node mapping is available
// when APs are started and PercpuBlocks are created.
srat::init();
Madt::init();
// Parse SLIT after MADT for the NUMA distance matrix.
slit::init();
//TODO: support this on any arch
// SPCR must be initialized after MADT for interrupt controllers
#[cfg(target_arch = "aarch64")]
+31 -3
View File
@@ -17,9 +17,33 @@ pub struct Rsdp {
impl Rsdp {
pub unsafe fn get_rsdp(already_supplied_rsdp: Option<*const u8>) -> Option<Rsdp> {
already_supplied_rsdp.map(|rsdp_ptr| {
// TODO: Validate
unsafe { *(rsdp_ptr as *const Rsdp) }
already_supplied_rsdp.and_then(|rsdp_ptr| {
let rsdp = unsafe { *(rsdp_ptr as *const Rsdp) };
// Validate signature "RSD PTR "
if &rsdp.signature != b"RSD PTR " {
return None;
}
// ACPI 1.0 checksum: sum of first 20 bytes must be zero
let bytes_v1 = unsafe { core::slice::from_raw_parts(rsdp_ptr, 20) };
if bytes_v1.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
return None;
}
// ACPI 2.0+ extended checksum: sum of entire table (length bytes) must be zero
if rsdp.revision >= 2 {
let full_len = rsdp._length as usize;
if full_len < 36 || full_len > 256 {
return None;
}
let bytes_full = unsafe { core::slice::from_raw_parts(rsdp_ptr, full_len) };
if bytes_full.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
return None;
}
}
Some(rsdp)
})
}
@@ -31,4 +55,8 @@ impl Rsdp {
self.rsdt_address as usize
})
}
/// Returns the RSDP `revision` field unchanged.
pub fn revision(&self) -> u8 {
self.revision
}
}
@@ -24,4 +24,20 @@ impl Sdt {
let header_size = size_of::<Sdt>();
total_size.saturating_sub(header_size)
}
/// Check this table's ACPI checksum.
///
/// Per ACPI 6.5 §5.2.2 every byte of the table, checksum field included,
/// must add up to zero in wrapping 8-bit arithmetic. Returns `false` when
/// the sum is non-zero or when the `length` field is smaller than the SDT
/// header itself.
pub fn validate_checksum(&self) -> bool {
    let table_len = self.length as usize;
    // A table cannot be shorter than its own header.
    if table_len < size_of::<Sdt>() {
        return false;
    }
    let base = self as *const _ as *const u8;
    let table = unsafe { core::slice::from_raw_parts(base, table_len) };
    let mut total = 0u8;
    for &byte in table {
        total = total.wrapping_add(byte);
    }
    total == 0
}
}
@@ -0,0 +1,45 @@
//! SLIT (System Locality Information Table) parser.
//!
//! Parses the NUMA distance matrix for scheduler NUMA-aware work stealing.
use super::sdt::Sdt;
use crate::acpi::find_sdt;

/// Largest locality count this parser accepts; bigger tables are ignored.
const MAX_NODES: usize = 8;
/// Distance reported whenever no SLIT value is available for a node pair.
const DEFAULT_DISTANCE: u8 = 10;
/// Byte offset of the distance matrix inside the table data: it follows the
/// 64-bit locality count.
const MATRIX_OFFSET: usize = 8;

// Written only by `init`, read by the accessors below.
static mut SLIT_MATRIX: [[u8; MAX_NODES]; MAX_NODES] = [[DEFAULT_DISTANCE; MAX_NODES]; MAX_NODES];
static mut SLIT_NUM_NODES: usize = 0;
static mut SLIT_AVAILABLE: bool = false;

/// Whether `init` found and loaded a usable SLIT.
pub fn is_available() -> bool {
    unsafe { SLIT_AVAILABLE }
}

/// Number of localities in the loaded SLIT, or 0 when none was loaded.
pub fn num_nodes() -> usize {
    unsafe { SLIT_NUM_NODES }
}

/// Distance between two nodes as recorded in the SLIT.
///
/// Returns 10 when no SLIT was loaded or when either index is outside the
/// supported `MAX_NODES` range.
pub fn distance(from: u8, to: u8) -> u8 {
    if !unsafe { SLIT_AVAILABLE } {
        return DEFAULT_DISTANCE;
    }
    let (from, to) = (usize::from(from), usize::from(to));
    if from >= MAX_NODES || to >= MAX_NODES {
        return DEFAULT_DISTANCE;
    }
    unsafe { SLIT_MATRIX[from][to] }
}

/// Heuristic used by callers: two nodes count as the same socket when their
/// distance is at most 20.
pub fn same_socket(node1: u8, node2: u8) -> bool {
    distance(node1, node2) <= 20
}

/// Locate the SLIT and load its distance matrix.
///
/// Does nothing when the table is absent, duplicated, truncated, or
/// describes zero or more than `MAX_NODES` localities.
pub fn init() {
    let sdt = match find_sdt("SLIT").as_slice() {
        [] => return,
        [x] => *x,
        xs => {
            println!("SLIT: {} tables found, expected 1", xs.len());
            return;
        }
    };
    if &sdt.signature != b"SLIT" {
        return;
    }
    let data_addr = sdt.data_address();
    let data_len = sdt.data_len();
    if data_len < MATRIX_OFFSET {
        return;
    }

    // The table data is not guaranteed to be 8-byte aligned, so the count
    // must not be read through a plain `*const u64` dereference.
    let num_nodes = unsafe { (data_addr as *const u64).read_unaligned() };
    // Range-check the full 64-bit value before narrowing, so a huge count
    // cannot wrap to a small one on 32-bit targets.
    if num_nodes == 0 || num_nodes > MAX_NODES as u64 {
        println!("SLIT: {num_nodes} nodes (max {MAX_NODES}), ignoring");
        return;
    }
    let num_nodes = num_nodes as usize;

    let matrix_start = MATRIX_OFFSET;
    let matrix_size = num_nodes * num_nodes;
    if data_len < matrix_start + matrix_size {
        println!("SLIT: matrix truncated ({data_len} < {})", matrix_start + matrix_size);
        return;
    }

    // Bounds were checked against `data_len` just above.
    let raw = unsafe {
        core::slice::from_raw_parts((data_addr + matrix_start) as *const u8, matrix_size)
    };

    // Build the matrix locally and store it with one assignment; this avoids
    // taking a reference to the `static mut`.
    let mut matrix = [[DEFAULT_DISTANCE; MAX_NODES]; MAX_NODES];
    for (row, src) in matrix.iter_mut().zip(raw.chunks_exact(num_nodes)) {
        row[..num_nodes].copy_from_slice(src);
    }

    unsafe {
        SLIT_MATRIX = matrix;
        SLIT_NUM_NODES = num_nodes;
        SLIT_AVAILABLE = true;
    }
    debug!("SLIT: {} nodes, distance matrix loaded", num_nodes);
}
+102
View File
@@ -0,0 +1,102 @@
//! SRAT (System Resource Affinity Table) parser.
//!
//! Parses CPU-to-NUMA-node and memory-to-NUMA-node affinity information.
//! Called before MADT init so that NUMA data is available during AP startup.
use super::sdt::Sdt;
use crate::acpi::find_sdt;

/// Capacity of the CPU affinity table; entries beyond it are dropped.
const MAX_CPU_ENTRIES: usize = 256;
/// Capacity of the memory affinity table; entries beyond it are dropped.
const MAX_MEM_ENTRIES: usize = 64;
/// Bytes of table data that precede the first affinity structure.
const ENTRIES_OFFSET: usize = 12;

#[derive(Clone, Copy)]
struct SratCpuEntry {
    apic_id: u32,
    node: u8,
    enabled: bool,
}

#[derive(Clone, Copy)]
struct SratMemEntry {
    node: u8,
    base: u64,
    length: u64,
    enabled: bool,
}

const CPU_NONE: SratCpuEntry = SratCpuEntry { apic_id: u32::MAX, node: 0, enabled: false };
const MEM_NONE: SratMemEntry = SratMemEntry { node: 0, base: 0, length: 0, enabled: false };

// Written only by `init`, read by the accessors below.
static mut SRAT_CPU_ENTRIES: [SratCpuEntry; MAX_CPU_ENTRIES] = [CPU_NONE; MAX_CPU_ENTRIES];
static mut SRAT_MEM_ENTRIES: [SratMemEntry; MAX_MEM_ENTRIES] = [MEM_NONE; MAX_MEM_ENTRIES];
static mut SRAT_CPU_COUNT: usize = 0;
static mut SRAT_MEM_COUNT: usize = 0;
static mut SRAT_AVAILABLE: bool = false;

/// Whether `init` found and parsed an SRAT.
pub fn is_available() -> bool {
    unsafe { SRAT_AVAILABLE }
}

/// CPU entries recorded by `init`; empty when no SRAT was parsed.
fn recorded_cpu_entries() -> &'static [SratCpuEntry] {
    if !unsafe { SRAT_AVAILABLE } {
        return &[];
    }
    let count = unsafe { SRAT_CPU_COUNT };
    // Go through a raw pointer instead of `&SRAT_CPU_ENTRIES` so no reference
    // is formed directly from the `static mut` place.
    let all = unsafe { &*core::ptr::addr_of!(SRAT_CPU_ENTRIES) };
    &all[..count]
}

/// NUMA node of the enabled CPU with the given APIC ID, if the SRAT lists it.
pub fn numa_node_for_apic(apic_id: u32) -> Option<u8> {
    recorded_cpu_entries()
        .iter()
        .find(|entry| entry.apic_id == apic_id && entry.enabled)
        .map(|entry| entry.node)
}

/// Highest node number among enabled CPU entries, plus one.
///
/// Returns 1 when no SRAT was parsed or no CPU entry is enabled.
pub fn numa_node_count() -> usize {
    let max_node = recorded_cpu_entries()
        .iter()
        .filter(|entry| entry.enabled)
        .map(|entry| entry.node)
        .max()
        .unwrap_or(0);
    usize::from(max_node) + 1
}

// The structs below describe the bytes that follow the 2-byte type/length
// header of each affinity structure.
#[repr(C, packed)]
struct SratLocalApic {
    _proximity_lo: u8,
    apic_id: u8,
    flags: u32,
    _local_sapic_eid: u8,
    _proximity_hi: [u8; 3],
    _clock_domain: u32,
}

#[repr(C, packed)]
struct SratMemoryAffinity {
    proximity_domain: u32,
    _reserved1: u16,
    base_address_lo: u32,
    base_address_hi: u32,
    length_lo: u32,
    length_hi: u32,
    _reserved2: u32,
    flags: u32,
    _reserved3: u64,
}

#[repr(C, packed)]
struct SratLocalX2Apic {
    _reserved: u16,
    proximity_domain: u32,
    x2apic_id: u32,
    flags: u32,
    _clock_domain: u32,
    _reserved2: u32,
}

/// Narrow a 32-bit proximity domain to the 8-bit node number stored here.
///
/// Domains above 255 are rejected rather than truncated: truncation would
/// alias them onto an unrelated low-numbered node.
fn node_for_domain(domain: u32, enabled: bool, what: &str) -> Option<u8> {
    match u8::try_from(domain) {
        Ok(node) => Some(node),
        Err(_) => {
            // Disabled entries are never returned by lookups, so stay quiet for them.
            if enabled {
                println!("SRAT: {what} proximity domain {domain} exceeds 255, ignoring entry");
            }
            None
        }
    }
}

/// Locate the SRAT and record its CPU and memory affinity entries.
///
/// Does nothing when the table is absent, duplicated, or shorter than its
/// fixed header. Parsing stops at the first structure whose length field is
/// invalid; everything recorded up to that point is kept.
pub fn init() {
    let sdt = match find_sdt("SRAT").as_slice() {
        [] => return,
        [x] => *x,
        xs => {
            println!("SRAT: {} tables found, expected 1", xs.len());
            return;
        }
    };
    if &sdt.signature != b"SRAT" {
        return;
    }
    let data_addr = sdt.data_address();
    let data_len = sdt.data_len();
    if data_len < ENTRIES_OFFSET {
        println!("SRAT: table too short ({data_len} bytes)");
        return;
    }

    // Raw-pointer round trip: avoids `&mut STATIC_MUT` expressions.
    let cpu_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_CPU_ENTRIES) };
    let mem_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_MEM_ENTRIES) };
    let mut cpu_count: usize = 0;
    let mut mem_count: usize = 0;

    let mut offset: usize = ENTRIES_OFFSET;
    while offset + 2 <= data_len {
        let entry_type = unsafe { *((data_addr + offset) as *const u8) };
        let entry_len = unsafe { *((data_addr + offset + 1) as *const u8) } as usize;
        // A length below the header size would never advance; one past the
        // end of the table would read out of bounds.
        if entry_len < 2 || offset + entry_len > data_len {
            break;
        }
        let entry_data = data_addr + offset + 2;

        match entry_type {
            0x0 if entry_len >= size_of::<SratLocalApic>() + 2 => {
                let e = unsafe { &*(entry_data as *const SratLocalApic) };
                // Copy packed fields into locals before using them.
                let flags = e.flags;
                let hi = e._proximity_hi;
                let enabled = (flags & 1) == 1;
                let domain = u32::from_le_bytes([e._proximity_lo, hi[0], hi[1], hi[2]]);
                if let Some(node) = node_for_domain(domain, enabled, "CPU") {
                    if cpu_count < MAX_CPU_ENTRIES {
                        cpu_entries[cpu_count] =
                            SratCpuEntry { apic_id: u32::from(e.apic_id), node, enabled };
                        cpu_count += 1;
                    }
                }
            }
            0x1 if entry_len >= size_of::<SratMemoryAffinity>() + 2 => {
                let e = unsafe { &*(entry_data as *const SratMemoryAffinity) };
                let flags = e.flags;
                let domain = e.proximity_domain;
                let (base_lo, base_hi) = (e.base_address_lo, e.base_address_hi);
                let (len_lo, len_hi) = (e.length_lo, e.length_hi);
                let enabled = (flags & 1) == 1;
                let base = (u64::from(base_hi) << 32) | u64::from(base_lo);
                let length = (u64::from(len_hi) << 32) | u64::from(len_lo);
                if let Some(node) = node_for_domain(domain, enabled, "memory") {
                    if mem_count < MAX_MEM_ENTRIES {
                        mem_entries[mem_count] = SratMemEntry { node, base, length, enabled };
                        mem_count += 1;
                    }
                }
            }
            0x2 if entry_len >= size_of::<SratLocalX2Apic>() + 2 => {
                let e = unsafe { &*(entry_data as *const SratLocalX2Apic) };
                let flags = e.flags;
                let domain = e.proximity_domain;
                let apic_id = e.x2apic_id;
                let enabled = (flags & 1) == 1;
                if let Some(node) = node_for_domain(domain, enabled, "CPU") {
                    if cpu_count < MAX_CPU_ENTRIES {
                        cpu_entries[cpu_count] = SratCpuEntry { apic_id, node, enabled };
                        cpu_count += 1;
                    }
                }
            }
            // Other structure types, and known types that are too short, are skipped.
            _ => {}
        }
        offset += entry_len;
    }

    unsafe {
        SRAT_CPU_COUNT = cpu_count;
        SRAT_MEM_COUNT = mem_count;
        SRAT_AVAILABLE = true;
    }
    debug!("SRAT: {} CPU entries, {} memory entries", cpu_count, mem_count);
}
+27 -13
View File
@@ -7,26 +7,40 @@ mod linked_list;
/// Size of kernel heap
const KERNEL_HEAP_SIZE: usize = ::rmm::MEGABYTE;
/// Report a fatal kernel-heap setup failure and never return.
///
/// Prints `message` as given (no newline is appended to it), then a fixed
/// "cannot continue" line, and then spins forever.
#[cold]
fn halt_kernel_heap_init(message: &str) -> ! {
print!("{message}");
println!("Kernel heap initialization cannot continue. Halting.");
// Busy-wait forever; the function's `!` return type relies on this loop never exiting.
loop {
core::hint::spin_loop();
}
}
unsafe fn map_heap(mapper: &mut KernelMapper<true>, offset: usize, size: usize) {
let mut flush_all = PageFlushAll::new();
let heap_start_page = Page::containing_address(VirtualAddress::new(offset));
let heap_end_page = Page::containing_address(VirtualAddress::new(offset + size - 1));
for page in Page::range_inclusive(heap_start_page, heap_end_page) {
let phys = mapper
.allocator_mut()
.allocate_one()
.expect("failed to allocate kernel heap");
let phys = match mapper.allocator_mut().allocate_one() {
Some(phys) => phys,
None => halt_kernel_heap_init(
"FATAL: failed to allocate physical frame for kernel heap\n",
),
};
let flush = unsafe {
mapper
.map_phys(
page.start_address(),
phys,
PageFlags::new()
.write(true)
.global(cfg!(not(feature = "pti"))),
)
.expect("failed to map kernel heap")
match mapper.map_phys(
page.start_address(),
phys,
PageFlags::new()
.write(true)
.global(cfg!(not(feature = "pti"))),
) {
Some(flush) => flush,
None => halt_kernel_heap_init(
"FATAL: failed to map kernel heap virtual page\n",
),
}
};
flush_all.consume(flush);
}
@@ -91,7 +91,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
dtb::serial::init_early(dtb);
}
info!("Redox OS starting...");
info!("RedBear OS starting...");
args.print();
// Initialize RMM
@@ -97,7 +97,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
init_early(dtb);
}
info!("Redox OS starting...");
info!("RedBear OS starting...");
args.print();
if let Some(dtb) = &dtb {
@@ -14,6 +14,10 @@ pub struct IoApicRegs {
pointer: *const u32,
}
impl IoApicRegs {
/// Whether `idx` may be used as a redirection-table index.
///
/// The bound is inclusive: `idx` is accepted when it is less than or equal
/// to the value returned by `max_redirection_table_entries()`.
// NOTE(review): presumably that value is the highest valid index rather than
// a count — confirm against `max_redirection_table_entries`.
fn redirection_index_valid(&mut self, idx: u8) -> bool {
idx <= self.max_redirection_table_entries()
}
fn ioregsel(&self) -> *const u32 {
self.pointer
}
@@ -44,21 +48,28 @@ impl IoApicRegs {
pub fn read_ioapicver(&mut self) -> u32 {
self.read_reg(0x01)
}
pub fn read_ioredtbl(&mut self, idx: u8) -> u64 {
assert!(idx < 24);
pub fn read_ioredtbl(&mut self, idx: u8) -> Option<u64> {
if !self.redirection_index_valid(idx) {
warn!("IOAPIC read_ioredtbl index {} out of range", idx);
return None;
}
let lo = self.read_reg(0x10 + idx * 2);
let hi = self.read_reg(0x10 + idx * 2 + 1);
u64::from(lo) | (u64::from(hi) << 32)
Some(u64::from(lo) | (u64::from(hi) << 32))
}
pub fn write_ioredtbl(&mut self, idx: u8, value: u64) {
assert!(idx < 24);
pub fn write_ioredtbl(&mut self, idx: u8, value: u64) -> bool {
if !self.redirection_index_valid(idx) {
warn!("IOAPIC write_ioredtbl index {} out of range", idx);
return false;
}
let lo = value as u32;
let hi = (value >> 32) as u32;
self.write_reg(0x10 + idx * 2, lo);
self.write_reg(0x10 + idx * 2 + 1, hi);
true
}
pub fn max_redirection_table_entries(&mut self) -> u8 {
@@ -92,17 +103,37 @@ impl IoApic {
}
/// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode).
#[allow(dead_code)]
pub fn map(&self, idx: u8, info: MapInfo) {
self.regs.lock().write_ioredtbl(idx, info.as_raw())
pub fn map(&self, idx: u8, info: MapInfo) -> bool {
let Some(raw) = info.as_raw() else {
return false;
};
self.regs.lock().write_ioredtbl(idx, raw)
}
pub fn set_mask(&self, gsi: u32, mask: bool) {
let idx = (gsi - self.gsi_start) as u8;
let mut guard = self.regs.lock();
let mut reg = guard.read_ioredtbl(idx);
let Some(mut reg) = guard.read_ioredtbl(idx) else {
return;
};
reg &= !(1 << 16);
reg |= u64::from(mask) << 16;
guard.write_ioredtbl(idx, reg);
let _ = guard.write_ioredtbl(idx, reg);
}
/// Change the destination APIC for a GSI by reprogramming the redirection table entry.
///
/// Preserves all other fields (vector, polarity, trigger mode, delivery mode, mask).
///
/// Returns `true` if the entry was successfully updated. Returns `false`
/// without touching the hardware when:
/// - `gsi` lies below this I/O APIC's `gsi_start`,
/// - the resulting table index does not fit in a `u8`,
/// - `dest` does not fit in the 8-bit destination field (bits 63:56), or
/// - the redirection-table read or write rejects the index.
pub fn set_irq_affinity(&self, gsi: u32, dest: ApicId) -> bool {
    // Reject GSIs that do not belong to this I/O APIC instead of wrapping
    // around or silently truncating the index.
    let Some(offset) = gsi.checked_sub(self.gsi_start) else {
        return false;
    };
    let Ok(idx) = u8::try_from(offset) else {
        return false;
    };

    // The destination field is only 8 bits wide; a larger ID would have its
    // upper bits shifted out and the interrupt would be routed elsewhere.
    let dest_id = u64::from(dest.get());
    if dest_id > 0xFF {
        return false;
    }

    let mut guard = self.regs.lock();
    let Some(entry) = guard.read_ioredtbl(idx) else {
        return false;
    };

    // Clear destination field (bits 63:56 for xAPIC physical mode)
    // and set new destination APIC ID.
    let updated = (entry & !(0xFF_u64 << 56)) | (dest_id << 56);
    guard.write_ioredtbl(idx, updated)
}
}
@@ -149,19 +180,26 @@ pub struct MapInfo {
}
impl MapInfo {
pub fn as_raw(&self) -> u64 {
assert!(self.vector >= 0x20);
assert!(self.vector <= 0xFE);
pub fn as_raw(&self) -> Option<u64> {
if !(0x20..=0xFE).contains(&self.vector) {
warn!(
"Refusing to map IOAPIC vector outside valid range: {:#x}",
self.vector
);
return None;
}
// TODO: Check for reserved fields.
(u64::from(self.dest.get()) << 56)
Some(
(u64::from(self.dest.get()) << 56)
| (u64::from(self.mask) << 16)
| ((self.trigger_mode as u64) << 15)
| ((self.polarity as u64) << 13)
| ((self.dest_mode as u64) << 11)
| ((self.delivery_mode as u64) << 8)
| u64::from(self.vector)
| u64::from(self.vector),
)
}
}
@@ -175,7 +213,7 @@ impl fmt::Debug for IoApic {
let count = guard.max_redirection_table_entries();
f.debug_list()
.entries((0..count).map(|i| guard.read_ioredtbl(i)))
.entries((0..=count).filter_map(|i| guard.read_ioredtbl(i)))
.finish()
}
}
@@ -237,11 +275,14 @@ pub unsafe fn handle_ioapic(madt_ioapic: &'static MadtIoApic) {
let ioapic_registers = virt.data() as *const u32;
let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base);
assert_eq!(
ioapic.regs.lock().id(),
madt_ioapic.id,
"mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC"
);
let detected_id = ioapic.regs.lock().id();
if detected_id != madt_ioapic.id {
warn!(
"mismatched ACPI MADT I/O APIC ID: MADT={}, IOAPIC={}; continuing with detected hardware",
madt_ioapic.id,
detected_id
);
}
(*IOAPICS.get()).get_or_insert_with(Vec::new).push(ioapic);
}
@@ -310,11 +351,11 @@ pub unsafe fn init() {
}
}
}
println!(
"I/O APICs: {:?}, overrides: {:?}",
ioapics(),
src_overrides()
);
for ioapic in ioapics() {
for idx in 0..=ioapic.count {
ioapic.set_mask(ioapic.gsi_start + u32::from(idx), true);
}
}
// map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it
// wouldn't have been disabled due to this I/O APIC)
@@ -329,7 +370,6 @@ pub unsafe fn init() {
.iter()
.any(|over| over.bus_irq == legacy_irq)
{
// there's an IRQ conflict, making this legacy IRQ inaccessible.
continue;
}
(
@@ -349,7 +389,6 @@ pub unsafe fn init() {
let redir_tbl_index = (gsi - apic.gsi_start) as u8;
let map_info = MapInfo {
// only send to the BSP
dest: bsp_apic_id,
dest_mode: DestinationMode::Physical,
delivery_mode: DeliveryMode::Fixed,
@@ -366,7 +405,32 @@ pub unsafe fn init() {
},
vector: 32 + legacy_irq,
};
apic.map(redir_tbl_index, map_info);
if !apic.map(redir_tbl_index, map_info) {
warn!(
"Unable to map legacy IRQ {} (GSI {}) through IOAPIC index {}",
legacy_irq,
gsi,
redir_tbl_index
);
}
if legacy_irq == 0 && gsi != u32::from(legacy_irq) {
if let Some(apic0) = find_ioapic(u32::from(legacy_irq)) {
let idx0 = (u32::from(legacy_irq) - apic0.gsi_start) as u8;
let _ = apic0.map(
idx0,
MapInfo {
dest: bsp_apic_id,
dest_mode: DestinationMode::Physical,
delivery_mode: DeliveryMode::Fixed,
mask: false,
polarity: ApicPolarity::ActiveHigh,
trigger_mode: ApicTriggerMode::Edge,
vector: 32,
},
);
}
}
}
println!(
"I/O APICs: {:?}, overrides: {:?}",
@@ -406,7 +470,7 @@ fn resolve(irq: u8) -> u32 {
fn find_ioapic(gsi: u32) -> Option<&'static IoApic> {
ioapics()
.iter()
.find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count))
.find(|apic| gsi >= apic.gsi_start && gsi <= apic.gsi_start + u32::from(apic.count))
}
pub unsafe fn mask(irq: u8) {
@@ -425,3 +489,14 @@ pub unsafe fn unmask(irq: u8) {
};
apic.set_mask(gsi, false);
}
/// Re-targets a legacy IRQ at another APIC.
///
/// The IRQ number is translated with `resolve`, the I/O APIC owning the
/// resulting GSI is looked up with `find_ioapic`, and the update itself is
/// delegated to `IoApic::set_irq_affinity`. Returns `false` when no I/O APIC
/// covers the GSI; otherwise returns whatever `set_irq_affinity` reports.
pub unsafe fn set_affinity(irq: u8, dest: ApicId) -> bool {
    let gsi = resolve(irq);
    find_ioapic(gsi).map_or(false, |owner| owner.set_irq_affinity(gsi, dest))
}
@@ -4,9 +4,11 @@ pub mod cpu;
pub mod hpet;
pub mod ioapic;
pub mod local_apic;
pub mod msi;
pub mod pic;
pub mod pit;
pub mod serial;
pub mod vector;
#[cfg(feature = "system76_ec_debug")]
pub mod system76_ec;
@@ -23,8 +25,7 @@ pub unsafe fn init() {
}
}
pub unsafe fn init_after_acpi() {
// this will disable the IOAPIC if needed.
//ioapic::init(mapper);
unsafe { ioapic::init() };
}
unsafe fn init_hpet() -> bool {
@@ -0,0 +1,183 @@
// MSI/MSI-X support for x86 — kernel-level message composition and validation
// Cross-referenced from Linux 7.0: arch/x86/kernel/apic/msi.c (391 lines)
use crate::arch::device::local_apic::ApicId;
/// Fixed upper part of every x86 MSI address: bits 31:20 are `0xFEE`.
pub const MSI_ADDRESS_BASE: u64 = 0xFEE0_0000;
/// Selects the bits of an address that must equal `MSI_ADDRESS_BASE`: everything
/// from bit 20 upwards. The low 20 bits carry the destination ID (bits 19:12)
/// and the mode flags below, so they must not take part in the comparison;
/// the previous value `0xFEEF_F000` compared the destination ID against zero
/// and ignored bits 20 and 24.
pub const MSI_ADDRESS_MASK: u64 = 0xFFFF_FFFF_FFF0_0000;
/// Address bit 2: destination mode (set = logical).
const MSI_DEST_MODE_LOGICAL: u64 = 1 << 2;
/// Address bit 3: redirection hint.
const MSI_REDIRECTION_HINT: u64 = 1 << 3;
/// Wrapper around the raw address value of an MSI message.
#[derive(Debug, Clone, Copy)]
pub struct MsiAddress {
// Raw 64-bit address value; the accessors in `impl MsiAddress` decode it.
pub raw: u64,
}
/// Wrapper around the raw data value of an MSI message.
#[derive(Debug, Clone, Copy)]
pub struct MsiData {
// Raw 32-bit data value; the accessors in `impl MsiData` decode it.
pub raw: u32,
}
/// A complete MSI message: the address/data pair.
#[derive(Debug, Clone)]
pub struct MsiMessage {
pub address: MsiAddress,
pub data: MsiData,
}
impl MsiAddress {
    /// Builds an MSI address from `MSI_ADDRESS_BASE`, the destination APIC ID
    /// (placed at bits 19:12) and the two optional flag bits.
    pub fn new(dest_apic_id: u8, redirection_hint: bool, dest_mode_logical: bool) -> Self {
        let hint_bits = if redirection_hint {
            MSI_REDIRECTION_HINT
        } else {
            0
        };
        let mode_bits = if dest_mode_logical {
            MSI_DEST_MODE_LOGICAL
        } else {
            0
        };
        let dest_bits = u64::from(dest_apic_id) << 12;
        Self {
            raw: MSI_ADDRESS_BASE | dest_bits | hint_bits | mode_bits,
        }
    }

    /// Returns `true` when `addr`, masked with `MSI_ADDRESS_MASK`, equals
    /// `MSI_ADDRESS_BASE`.
    pub fn validate(addr: u64) -> bool {
        MSI_ADDRESS_BASE == (addr & MSI_ADDRESS_MASK)
    }

    /// Extracts the 8-bit destination APIC ID stored at bits 19:12.
    pub fn dest_apic_id(&self) -> u8 {
        // Narrowing to `u8` keeps exactly the eight bits that start at bit 12.
        (self.raw >> 12) as u8
    }
}
impl MsiData {
    /// Builds the MSI data value.
    ///
    /// Layout produced: vector in bits 7:0, delivery mode in bits 10:8 and
    /// trigger mode in bit 15. `delivery_mode` and `trigger_mode` are reduced
    /// to their field widths (3 bits and 1 bit) so that an out-of-range
    /// argument cannot spill into a neighbouring field; the accessors below
    /// use the same widths.
    pub fn new(vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
        let delivery_bits = u32::from(delivery_mode & 0x7) << 8;
        let trigger_bits = u32::from(trigger_mode & 0x1) << 15;
        Self {
            raw: u32::from(vector) | delivery_bits | trigger_bits,
        }
    }

    /// Interrupt vector (bits 7:0).
    pub fn vector(&self) -> u8 {
        (self.raw & 0xFF) as u8
    }

    /// Delivery mode (bits 10:8).
    pub fn delivery_mode(&self) -> u8 {
        ((self.raw >> 8) & 0x7) as u8
    }

    /// Trigger mode (bit 15).
    pub fn trigger_mode(&self) -> u8 {
        ((self.raw >> 15) & 0x1) as u8
    }
}
impl MsiMessage {
    /// Assembles a message aimed at `dest` with the redirection hint clear and
    /// physical destination mode.
    ///
    /// NOTE(review): `dest.get()` is narrowed with `as u8`, so only the low
    /// eight bits of the APIC ID end up in the address.
    pub fn compose(dest: ApicId, vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
        Self {
            address: MsiAddress::new(dest.get() as u8, false, false),
            data: MsiData::new(vector, delivery_mode, trigger_mode),
        }
    }

    /// Returns `true` when the address passes `MsiAddress::validate` and the
    /// vector lies in `32..255`.
    pub fn validate(&self) -> bool {
        if !MsiAddress::validate(self.address.raw) {
            return false;
        }
        (32..255).contains(&self.data.vector())
    }
}
/// Free-function form of `MsiAddress::validate`: forwards `addr` and returns its result.
pub fn is_valid_msi_address(addr: u64) -> bool {
MsiAddress::validate(addr)
}
/// Returns `true` for vectors 32 through 254 inclusive.
pub fn is_valid_msi_vector(vector: u8) -> bool {
    (32..255).contains(&vector)
}
/// Decoded view of a PCI MSI capability structure.
#[derive(Debug)]
pub struct MsiCapability {
    /// Message Control word as passed to `parse`.
    pub msg_ctl: u16,
    /// Message Address, lower 32 bits.
    pub msg_addr_lo: u32,
    /// Message Address, upper 32 bits (0 when the capability is 32-bit).
    pub msg_addr_hi: u32,
    /// Message Data (low 16 bits of the data dword).
    pub msg_data: u16,
    /// Mask Bits register (0 when per-vector masking is not supported).
    pub mask_bits: u32,
    /// Pending Bits register (0 when per-vector masking is not supported).
    pub pending_bits: u32,
    /// Message Control bit 7.
    pub is_64bit: bool,
    /// Message Control bit 8.
    pub is_maskable: bool,
    /// Message Control bits 3:1.
    pub multiple_message_capable: u8,
}

impl MsiCapability {
    /// Decodes an MSI capability from its dwords.
    ///
    /// `raw[0]` is the capability header dword and `raw[1]` the lower address.
    /// The remaining layout depends on Message Control bit 7:
    ///
    /// | dword | 32-bit capability | 64-bit capability |
    /// |-------|-------------------|-------------------|
    /// | 2     | message data      | upper address     |
    /// | 3     | mask bits         | message data      |
    /// | 4     | pending bits      | mask bits         |
    /// | 5     | (unused)          | pending bits      |
    ///
    /// Mask and pending registers exist only when Message Control bit 8 is
    /// set; otherwise both are reported as 0.
    pub fn parse(raw: &[u32; 6], msg_ctl: u16) -> Self {
        let is_64bit = msg_ctl & (1 << 7) != 0;
        let is_maskable = msg_ctl & (1 << 8) != 0;

        // The data dword follows the address, which is one dword longer in
        // the 64-bit layout; mask and pending follow the data dword.
        let data_index = if is_64bit { 3 } else { 2 };
        let (mask_bits, pending_bits) = if is_maskable {
            (raw[data_index + 1], raw[data_index + 2])
        } else {
            (0, 0)
        };

        Self {
            msg_ctl,
            msg_addr_lo: raw[1],
            msg_addr_hi: if is_64bit { raw[2] } else { 0 },
            msg_data: (raw[data_index] & 0xFFFF) as u16,
            mask_bits,
            pending_bits,
            is_64bit,
            is_maskable,
            multiple_message_capable: ((msg_ctl >> 1) & 0x7) as u8,
        }
    }
}
/// Decoded view of a PCI MSI-X capability structure.
#[derive(Debug)]
pub struct MsixCapability {
    /// Message Control word as passed to `parse`.
    pub msg_ctl: u16,
    /// Offset of the MSI-X table inside its BAR (low three bits cleared).
    pub table_offset: u32,
    /// BAR indicator for the MSI-X table (low three bits of dword 1).
    pub table_bar: u8,
    /// Offset of the Pending Bit Array inside its BAR (low three bits cleared).
    pub pba_offset: u32,
    /// BAR indicator for the Pending Bit Array (low three bits of dword 2).
    pub pba_bar: u8,
    /// Number of table entries (the encoded field plus one).
    pub table_size: u16,
}

impl MsixCapability {
    /// Decodes an MSI-X capability from its three dwords.
    ///
    /// `raw[0]` is the capability header, `raw[1]` the Table Offset/BIR
    /// register and `raw[2]` the PBA Offset/BIR register. The Table Size field
    /// occupies Message Control bits 10:0 and is encoded as N-1, so the entry
    /// count is `(msg_ctl & 0x7FF) + 1` with no shift.
    pub fn parse(raw: &[u32; 3], msg_ctl: u16) -> Self {
        let table_reg = raw[1];
        let pba_reg = raw[2];
        Self {
            msg_ctl,
            table_offset: table_reg & !0x7,
            table_bar: (table_reg & 0x7) as u8,
            pba_offset: pba_reg & !0x7,
            pba_bar: (pba_reg & 0x7) as u8,
            table_size: (msg_ctl & 0x7FF) + 1,
        }
    }
}
// Unit tests for message composition, address validation and capability parsing.
#[cfg(test)]
mod tests {
use super::*;
// A composed message must validate and round-trip every field; note that the
// destination ID used here (3) is non-zero.
#[test]
fn test_compose_message() {
let msg = MsiMessage::compose(ApicId::new(3), 48, 0b101, 1);
assert!(msg.validate());
assert_eq!(msg.address.dest_apic_id(), 3);
assert_eq!(msg.data.vector(), 48);
assert_eq!(msg.data.delivery_mode(), 0b101);
assert_eq!(msg.data.trigger_mode(), 1);
}
// An address outside the 0xFEE window is rejected; the bare window base is accepted.
#[test]
fn test_invalid_address() {
assert!(!is_valid_msi_address(0xDEAD_BEEF));
assert!(is_valid_msi_address(0xFEE0_0000));
}
// With Message Control = 0, neither the 64-bit nor the maskable flag is reported.
#[test]
fn test_msi_parse() {
let raw = [0u32; 6];
let cap = MsiCapability::parse(&raw, 0);
assert!(!cap.is_64bit);
assert!(!cap.is_maskable);
}
}
@@ -0,0 +1,53 @@
use crate::cpu_set::LogicalCpuId;
/// Number of slots tracked by the allocation bitmap (7 banks x 32 bits).
const VECTOR_COUNT: usize = 224;
/// Allocation bitmap: one bit per slot, a set bit meaning "in use".
/// Slot `i` lives in bank `i / 32`, bit `i % 32`.
static VECTORS: [core::sync::atomic::AtomicU32; 7] =
    [const { core::sync::atomic::AtomicU32::new(0) }; 7];
/// Reserves the lowest free interrupt vector and returns it, or `None` when
/// every usable vector is taken.
///
/// Returned vectors lie in `32..=254`. Vector 255 is never handed out, which
/// matches the other range checks in the kernel (`available_irqs_iter` walks
/// `32..=254`, the I/O APIC mapping accepts `0x20..=0xFE`, and MSI validation
/// requires `< 255`).
///
/// `_cpu` is currently unused: all callers share one global bitmap.
pub fn allocate_vector(_cpu: LogicalCpuId) -> Option<u8> {
    use core::sync::atomic::Ordering;

    const FIRST_VECTOR: usize = 32;
    const LAST_VECTOR: usize = 254;

    for (bank, slot) in VECTORS.iter().enumerate() {
        let mut bits = slot.load(Ordering::Acquire);
        loop {
            // Index of the lowest clear bit; 32 means the bank is full.
            let free = bits.trailing_ones() as usize;
            if free >= 32 {
                break;
            }

            let idx = bank * 32 + free;
            let vector = FIRST_VECTOR + idx;
            // The lowest free slot of this bank is already out of range, so
            // every higher slot is too. Checking before the CAS means an
            // unusable bit is never set, not even transiently.
            if idx >= VECTOR_COUNT || vector > LAST_VECTOR {
                break;
            }

            match slot.compare_exchange_weak(
                bits,
                bits | (1u32 << free),
                Ordering::AcqRel,
                Ordering::Acquire,
            ) {
                Ok(_) => return Some(vector as u8),
                // Lost a race (or spurious failure): retry with the fresh value.
                Err(current) => bits = current,
            }
        }
    }
    None
}
/// Marks `vector` as free again in the global bitmap.
///
/// Vectors below 32, or at or beyond `32 + VECTOR_COUNT`, are ignored.
/// `_cpu` is currently unused.
pub fn free_vector(_cpu: LogicalCpuId, vector: u8) {
    let Some(offset) = vector.checked_sub(32) else {
        return;
    };
    let slot_index = usize::from(offset);
    if slot_index >= VECTOR_COUNT {
        return;
    }
    let keep_mask = !(1u32 << (slot_index % 32));
    VECTORS[slot_index / 32].fetch_and(keep_mask, core::sync::atomic::Ordering::Release);
}
@@ -192,6 +192,15 @@ impl ProcessorControlRegion {
}
}
/// Terminal error path for Processor Control Region allocation failure.
///
/// Prints two diagnostic lines and then spins forever; it never returns.
#[cold]
fn halt_pcr_init() -> ! {
    use core::hint::spin_loop;

    println!("FATAL: failed to allocate physical memory for Processor Control Region");
    println!("Processor startup cannot continue. Halting.");
    loop {
        spin_loop();
    }
}
pub unsafe fn pcr() -> *mut ProcessorControlRegion {
unsafe {
// Primitive benchmarking of RDFSBASE and RDGSBASE in userspace, appears to indicate that
@@ -375,7 +384,10 @@ pub fn allocate_and_init_pcr(
.next_power_of_two()
.trailing_zeros();
let pcr_frame = crate::memory::allocate_p2frame(alloc_order).expect("failed to allocate PCR");
let pcr_frame = match crate::memory::allocate_p2frame(alloc_order) {
Some(frame) => frame,
None => halt_pcr_init(),
};
let pcr_ptr = RmmA::phys_to_virt(pcr_frame.base()).data() as *mut ProcessorControlRegion;
unsafe { core::ptr::write(pcr_ptr, ProcessorControlRegion::new_partial_init(cpu_id)) };
@@ -78,6 +78,15 @@ static INIT_BSP_IDT: SyncUnsafeCell<Idt> = SyncUnsafeCell::new(Idt::new());
pub(crate) static IDTS: RwLock<HashMap<LogicalCpuId, &'static mut Idt>> =
RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
/// Terminal error path for backup-interrupt-stack allocation failure.
///
/// Prints two diagnostic lines and then spins forever; it never returns.
#[cold]
fn halt_idt_init() -> ! {
    use core::hint::spin_loop;

    println!("FATAL: failed to allocate physical pages for backup interrupt stack");
    println!("Interrupt setup cannot continue. Halting.");
    loop {
        spin_loop();
    }
}
#[inline]
pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool {
if cpu_id == LogicalCpuId::BSP {
@@ -101,6 +110,8 @@ pub fn set_reserved(cpu_id: LogicalCpuId, index: u8, reserved: bool) {
}
pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator<Item = u8> + 'static {
let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count();
info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count);
(32..=254).filter(move |&index| !is_reserved(cpu_id, index))
}
@@ -161,8 +172,10 @@ pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt {
.or_insert_with(|| Box::leak(Box::new(Idt::new())));
use crate::memory::{RmmA, RmmArch};
let frames = crate::memory::allocate_p2frame(4)
.expect("failed to allocate pages for backup interrupt stack");
let frames = match crate::memory::allocate_p2frame(4) {
Some(frames) => frames,
None => halt_idt_init(),
};
// Physical pages are mapped linearly. So is the linearly mapped virtual memory.
let base_address = RmmA::phys_to_virt(frames.base());
@@ -1,3 +1,5 @@
use core::sync::atomic::{AtomicBool, Ordering};
use syscall::Exception;
use x86::irq::PageFaultError;
@@ -10,6 +12,22 @@ use crate::{
syscall::flag::*,
};
static NMI_IN_PROGRESS: AtomicBool = AtomicBool::new(false);
/// Writes `bytes` one at a time to I/O port 0x3F8 using port I/O only.
///
/// Before each byte the status port at 0x3F8 + 5 is polled until bit 5 is
/// set (on a 16550-style UART that is the "transmit holding register empty"
/// flag of the line status register). No lock is taken and nothing is
/// allocated, so the routine can be used from the NMI handler.
unsafe fn nmi_raw_serial_write(bytes: &[u8]) {
    use crate::syscall::io::{Io, Pio};

    const PORT_BASE: u16 = 0x3F8;
    const TX_READY: u8 = 1 << 5;

    let mut data_port = Pio::<u8>::new(PORT_BASE);
    let status_port = Pio::<u8>::new(PORT_BASE + 5);

    for byte in bytes.iter().copied() {
        // Busy-wait until the transmitter can accept another byte.
        while status_port.read() & TX_READY == 0 {
            core::hint::spin_loop();
        }
        data_port.write(byte);
    }
}
interrupt_stack!(divide_by_zero, |stack| {
println!("Divide by zero");
stack.trace();
@@ -55,9 +73,35 @@ interrupt_stack!(non_maskable, @paranoid, |stack| {
#[cfg(not(all(target_arch = "x86_64", feature = "profiling")))]
{
// TODO: This will likely deadlock
println!("Non-maskable interrupt");
stack.dump();
if NMI_IN_PROGRESS.swap(true, Ordering::SeqCst) {
return;
}
unsafe {
nmi_raw_serial_write(b"Non-maskable interrupt\n");
nmi_raw_serial_write(b" RIP: ");
#[cfg(target_arch = "x86")]
let instruction_pointer = u64::from(stack.iret.eip);
#[cfg(target_arch = "x86_64")]
let instruction_pointer = stack.iret.rip;
let mut buf = [0u8; 19];
buf[0] = b'0';
buf[1] = b'x';
for i in 0..16 {
let nibble = ((instruction_pointer >> (60 - i * 4)) & 0xF) as u8;
buf[2 + i] = if nibble < 10 {
b'0' + nibble
} else {
b'a' + nibble - 10
};
}
buf[18] = b'\n';
nmi_raw_serial_write(&buf);
}
NMI_IN_PROGRESS.store(false, Ordering::SeqCst);
}
});
@@ -28,6 +28,8 @@ pub mod pti;
/// Initialization and start function
pub mod start;
pub mod sleep;
/// Stop function
pub mod stop;
@@ -0,0 +1,712 @@
use alloc::{sync::Arc, vec::Vec};
use core::{
ptr::NonNull,
str::FromStr,
sync::atomic::{AtomicU32, Ordering},
};
use acpi_ext::{
aml::{namespace::AmlName, object::Object, Interpreter},
registers::FixedRegisters,
sdt::{facs::Facs, fadt::Fadt, SdtHeader},
AcpiTables, Handle, Handler, PhysicalMapping,
};
use spin::Mutex;
use syscall::error::{Error, EINVAL, EIO};
use x86::{segmentation::SegmentSelector, task, Ring};
use crate::{
acpi::ACPI_ROOT_INFO,
arch::interrupt,
memory::{
round_down_pages, round_up_pages, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA,
RmmArch, VirtualAddress, PAGE_SIZE,
},
syscall::io::{Io, Pio},
};
// Bit position of the SLP_TYP field in a PM1 control value (see `encode_sleep_type`).
const ACPI_SLP_TYP_SHIFT: u16 = 10;
// Mask covering the SLP_TYP field, bits 12:10.
const ACPI_SLP_TYP_MASK: u16 = 0x1C00;
// SLP_EN bit (bit 13) of a PM1 control value.
const ACPI_SLP_EN: u16 = 1 << 13;
// Physical address of the page the wake trampoline is copied to; also the
// value written to the FACS waking vector.
const WAKE_TRAMPOLINE_PHYS: usize = 0x8000;
// Value `enter_sleep_raw` returns on success; any other value is an error code.
const SLEEP_RETURN_OK: usize = 0;
// Contents of the `s3_wakeup` file from the build's OUT_DIR, embedded at
// compile time. Only defined for x86_64.
#[cfg(target_arch = "x86_64")]
static WAKE_TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/s3_wakeup"));
/// In-memory operand of `sgdt`/`sidt`/`lgdt`/`lidt` as used in this file:
/// a 16-bit limit immediately followed by a 64-bit base. `packed` removes
/// the padding that would otherwise separate the two fields.
#[repr(C, packed)]
#[derive(Clone, Copy, Debug, Default)]
struct DescriptorTableRegister {
limit: u16,
base: u64,
}
/// Save area handed to `fxsave64` / `fxrstor64`.
///
/// The buffer is 4096 bytes and 64-byte aligned.
#[repr(C, align(64))]
#[derive(Clone, Copy, Debug)]
struct FpuState {
    bytes: [u8; 4096],
}

impl Default for FpuState {
    /// Returns an all-zero save area.
    fn default() -> Self {
        FpuState { bytes: [0u8; 4096] }
    }
}
/// Sleep states this module can request. `sleep_state_name` maps them to the
/// AML objects `\_S3` and `\_S5`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SleepState {
S3,
S5,
}
/// Reasons a sleep request can fail.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SleepError {
    UnsupportedArch,
    MissingAcpi,
    MissingFadt,
    MissingFacs,
    MissingSleepObject,
    InvalidSleepObject,
    UnsupportedPmControl,
    UnsupportedAmlOperation,
    SleepDidNotEnter,
}

impl SleepError {
    /// Collapses the error into the errno-style value returned through the
    /// raw sleep entry point: `EIO` for missing tables/objects and failed AML
    /// operations, `EINVAL` for everything else.
    fn code(self) -> usize {
        let errno = match self {
            Self::MissingAcpi
            | Self::MissingFadt
            | Self::MissingFacs
            | Self::MissingSleepObject
            | Self::UnsupportedAmlOperation => EIO,
            Self::UnsupportedArch
            | Self::InvalidSleepObject
            | Self::UnsupportedPmControl
            | Self::SleepDidNotEnter => EINVAL,
        };
        errno as usize
    }

    /// Reverses `code` as far as possible. The mapping is lossy: `EINVAL`
    /// becomes `InvalidSleepObject` and every other value becomes
    /// `MissingAcpi`.
    fn from_code(code: usize) -> Self {
        if code as i32 == EINVAL {
            Self::InvalidSleepObject
        } else {
            Self::MissingAcpi
        }
    }
}
/// CPU state captured by `save_cpu_context` before suspend and consumed by
/// `resume_from_s3_trampoline` after wake.
#[derive(Clone, Copy, Debug, Default)]
struct SavedCpuContext {
// RSP at entry to `enter_sleep_raw`; the resume path loads it and executes `ret`.
entry_rsp: usize,
// RSP sampled inside `save_cpu_context`; passed to the wake trampoline as its stack.
runtime_rsp: usize,
// Physical address of the FACS, used on resume to clear the waking vector (0 = none).
facs_address: usize,
// Control registers.
cr0: usize,
cr2: usize,
cr3: usize,
cr4: usize,
rflags: usize,
// Descriptor-table registers as stored by SGDT / SIDT.
gdtr: DescriptorTableRegister,
idtr: DescriptorTableRegister,
// Model-specific registers: IA32_EFER, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GSBASE.
efer: u64,
fs_base: u64,
gs_base: u64,
kernel_gs_base: u64,
// FXSAVE image.
fpu: FpuState,
}
// Context stored by `enter_sleep_raw_inner` and taken by `resume_from_s3_trampoline`.
static SAVED_CONTEXT: Mutex<Option<SavedCpuContext>> = Mutex::new(None);
// Source of the handle values returned by `KernelAcpiHandler::create_mutex`; starts at 1.
static AML_MUTEX_IDS: AtomicU32 = AtomicU32::new(1);
/// SLP_TYP values for the PM1a (`a`) and PM1b (`b`) control registers,
/// already passed through `encode_sleep_type`.
#[derive(Clone, Copy, Debug)]
struct SleepTypeData {
a: u16,
b: u16,
}
/// Zero-sized `Handler` implementation that gives the ACPI/AML library access
/// to physical memory and I/O ports from inside the kernel.
#[derive(Clone, Copy)]
struct KernelAcpiHandler;
impl KernelAcpiHandler {
// Linearly maps every page overlapping `[physical_address, physical_address + size)`
// and returns the virtual pointer corresponding to `physical_address` together
// with the page-rounded mapped length.
//
// Panics (via `expect`) if any page cannot be mapped.
// NOTE(review): pages are mapped with `PageFlags::new()` and no explicit
// `.write(true)`, yet callers also write through these mappings (e.g. the FACS
// waking vector) — confirm the default flags or a pre-existing mapping allow that.
fn map_range(physical_address: usize, size: usize) -> (*mut u8, usize) {
let map_base = round_down_pages(physical_address);
let map_offset = physical_address - map_base;
let mapped_length = round_up_pages(size + map_offset);
// SAFETY: The ACPI interpreter only requests firmware-described physical regions.
unsafe {
let mut mapper = KernelMapper::lock_rw();
for page_index in 0..mapped_length / PAGE_SIZE {
let (_, flush) = mapper
.map_linearly(
PhysicalAddress::new(map_base + page_index * PAGE_SIZE),
PageFlags::new(),
)
.expect("failed to linearly map ACPI physical region");
flush.flush();
}
}
// The pointer is derived from the page base, then the in-page offset is re-applied.
let virtual_base = RmmA::phys_to_virt(PhysicalAddress::new(map_base)).data();
((virtual_base + map_offset) as *mut u8, mapped_length)
}
}
// `Handler` callbacks used by the ACPI table parser and AML interpreter.
//
// Summary of what each group does in this implementation:
// - map/unmap: mapping goes through `map_range`; unmapping is a no-op.
// - read_*/write_*: volatile accesses through `RmmA::phys_to_virt(address)`;
//   no mapping is established here (presumably the address is already covered
//   by the linear map — verify).
// - read_io_*/write_io_*: port I/O via `Pio`.
// - PCI config access: not implemented; reads return 0 and writes are dropped.
// - timing: `nanos_since_boot` always returns 0; `stall`/`sleep` are
//   uncalibrated busy loops.
// - AML mutexes: handles are unique numbers, but `acquire` always succeeds
//   and `release` does nothing.
impl Handler for KernelAcpiHandler {
unsafe fn map_physical_region<T>(&self, physical_address: usize, size: usize) -> PhysicalMapping<Self, T> {
let (virtual_start, mapped_length) = Self::map_range(physical_address, size);
PhysicalMapping {
physical_start: physical_address,
virtual_start: NonNull::new(virtual_start.cast::<T>())
.expect("expected mapped ACPI virtual address to be non-null"),
region_length: size,
mapped_length,
handler: *self,
}
}
// Mappings are never torn down.
fn unmap_physical_region<T>(_region: &PhysicalMapping<Self, T>) {}
fn read_u8(&self, address: usize) -> u8 {
// SAFETY: AML system-memory accesses are byte-addressable firmware regions.
unsafe { core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u8) }
}
fn read_u16(&self, address: usize) -> u16 {
// SAFETY: AML system-memory accesses are word-addressable firmware regions.
unsafe {
core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u16)
}
}
fn read_u32(&self, address: usize) -> u32 {
// SAFETY: AML system-memory accesses are dword-addressable firmware regions.
unsafe {
core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u32)
}
}
fn read_u64(&self, address: usize) -> u64 {
// SAFETY: AML system-memory accesses are qword-addressable firmware regions.
unsafe {
core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u64)
}
}
fn write_u8(&self, address: usize, value: u8) {
// SAFETY: AML system-memory accesses are byte-addressable firmware regions.
unsafe {
core::ptr::write_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u8, value)
}
}
fn write_u16(&self, address: usize, value: u16) {
// SAFETY: AML system-memory accesses are word-addressable firmware regions.
unsafe {
core::ptr::write_volatile(
RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u16,
value,
)
}
}
fn write_u32(&self, address: usize, value: u32) {
// SAFETY: AML system-memory accesses are dword-addressable firmware regions.
unsafe {
core::ptr::write_volatile(
RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u32,
value,
)
}
}
fn write_u64(&self, address: usize, value: u64) {
// SAFETY: AML system-memory accesses are qword-addressable firmware regions.
unsafe {
core::ptr::write_volatile(
RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u64,
value,
)
}
}
fn read_io_u8(&self, port: u16) -> u8 {
Pio::<u8>::new(port).read()
}
fn read_io_u16(&self, port: u16) -> u16 {
Pio::<u16>::new(port).read()
}
fn read_io_u32(&self, port: u16) -> u32 {
Pio::<u32>::new(port).read()
}
fn write_io_u8(&self, port: u16, value: u8) {
Pio::<u8>::new(port).write(value)
}
fn write_io_u16(&self, port: u16, value: u16) {
Pio::<u16>::new(port).write(value)
}
fn write_io_u32(&self, port: u16, value: u32) {
Pio::<u32>::new(port).write(value)
}
// PCI configuration space is not reachable from here: reads yield 0.
fn read_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u8 {
0
}
fn read_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u16 {
0
}
fn read_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u32 {
0
}
// PCI configuration writes are silently discarded.
fn write_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u8) {}
fn write_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u16) {}
fn write_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u32) {}
// No clock is consulted; the reported time never advances.
fn nanos_since_boot(&self) -> u64 {
0
}
// Busy-waits for 64 spin-loop hints per requested microsecond (not calibrated).
fn stall(&self, microseconds: u64) {
for _ in 0..(microseconds.saturating_mul(64)) {
core::hint::spin_loop();
}
}
// Busy-waits for 64_000 spin-loop hints per requested millisecond (not calibrated).
fn sleep(&self, milliseconds: u64) {
for _ in 0..(milliseconds.saturating_mul(64_000)) {
core::hint::spin_loop();
}
}
// Hands out a fresh, monotonically increasing handle number.
fn create_mutex(&self) -> Handle {
Handle(AML_MUTEX_IDS.fetch_add(1, Ordering::Relaxed))
}
// No locking is performed: acquisition always succeeds immediately.
fn acquire(&self, _mutex: Handle, _timeout: u16) -> Result<(), acpi_ext::aml::AmlError> {
Ok(())
}
fn release(&self, _mutex: Handle) {}
}
fn sleep_state_name(state: SleepState) -> &'static str {
match state {
SleepState::S3 => "\\_S3",
SleepState::S5 => "\\_S5",
}
}
/// Converts a sleep-type value into its position inside a PM1 control value.
///
/// Values `0..=7` are shifted left by `ACPI_SLP_TYP_SHIFT`; any larger value
/// is masked with `ACPI_SLP_TYP_MASK` instead of being shifted.
fn encode_sleep_type(value: u16) -> u16 {
    match value {
        0..=0x7 => value << ACPI_SLP_TYP_SHIFT,
        _ => value & ACPI_SLP_TYP_MASK,
    }
}
fn load_interpreter() -> Result<(
Arc<FixedRegisters<KernelAcpiHandler>>,
PhysicalMapping<KernelAcpiHandler, Facs>,
Interpreter<KernelAcpiHandler>,
), SleepError> {
let root = *ACPI_ROOT_INFO.get().ok_or(SleepError::MissingAcpi)?;
let handler = KernelAcpiHandler;
// SAFETY: ACPI root info is captured from the firmware-provided, already validated root table.
let tables = unsafe {
AcpiTables::from_rsdt(handler, root.revision, root.root_sdt_address.data())
.map_err(|_| SleepError::MissingAcpi)?
};
let fadt = tables.find_table::<Fadt>().ok_or(SleepError::MissingFadt)?;
let registers = Arc::new(
FixedRegisters::new(&fadt, handler).map_err(|_| SleepError::UnsupportedPmControl)?,
);
let facs_address = fadt.facs_address().map_err(|_| SleepError::MissingFacs)?;
// SAFETY: The FADT-supplied FACS address is used exactly as described by the ACPI spec.
let facs = unsafe { handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>()) };
// SAFETY: The AML interpreter only needs an owned mapping of the same firmware FACS table.
let interpreter_facs = unsafe {
handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>())
};
let dsdt = tables.dsdt().map_err(|_| SleepError::MissingFadt)?;
let interpreter = Interpreter::new(handler, dsdt.revision, Arc::clone(&registers), Some(interpreter_facs));
// SAFETY: Each AML table mapping is owned by the interpreter during table loading.
unsafe {
let mapping = handler.map_physical_region::<SdtHeader>(dsdt.phys_address, dsdt.length as usize);
let stream = core::slice::from_raw_parts(
mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::<SdtHeader>()) as *const u8,
dsdt.length as usize - core::mem::size_of::<SdtHeader>(),
);
interpreter
.load_table(stream)
.map_err(|_| SleepError::UnsupportedAmlOperation)?;
for ssdt in tables.ssdts() {
let mapping = handler.map_physical_region::<SdtHeader>(ssdt.phys_address, ssdt.length as usize);
let stream = core::slice::from_raw_parts(
mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::<SdtHeader>()) as *const u8,
ssdt.length as usize - core::mem::size_of::<SdtHeader>(),
);
interpreter
.load_table(stream)
.map_err(|_| SleepError::UnsupportedAmlOperation)?;
}
}
Ok((registers, facs, interpreter))
}
/// Evaluates the `\_Sx` object for `state` and extracts its first two
/// elements as the PM1a / PM1b sleep types.
///
/// Returns `MissingSleepObject` when the name cannot be parsed or evaluated,
/// and `InvalidSleepObject` when the result is not a package whose first two
/// elements are integers. Each integer is narrowed to `u16` and passed
/// through `encode_sleep_type`.
fn sleep_type_data_from_interpreter(
    interpreter: &Interpreter<KernelAcpiHandler>,
    state: SleepState,
) -> Result<SleepTypeData, SleepError> {
    let path = AmlName::from_str(sleep_state_name(state))
        .map_err(|_| SleepError::MissingSleepObject)?;
    let evaluated = interpreter
        .evaluate(path, Vec::new())
        .map_err(|_| SleepError::MissingSleepObject)?;

    let Object::Package(elements) = &*evaluated else {
        return Err(SleepError::InvalidSleepObject);
    };
    let (Some(first), Some(second)) = (elements.first(), elements.get(1)) else {
        return Err(SleepError::InvalidSleepObject);
    };
    let (Object::Integer(typa), Object::Integer(typb)) = (&**first, &**second) else {
        return Err(SleepError::InvalidSleepObject);
    };

    let a = encode_sleep_type(*typa as u16);
    let b = encode_sleep_type(*typb as u16);
    Ok(SleepTypeData { a, b })
}
/// Loads a fresh interpreter and looks up the sleep-type values for `state`.
///
/// Every call re-parses the ACPI tables via `load_interpreter`.
fn sleep_type_data(state: SleepState) -> Result<SleepTypeData, SleepError> {
    load_interpreter()
        .and_then(|(_registers, _facs, interpreter)| {
            sleep_type_data_from_interpreter(&interpreter, state)
        })
}
// Copies the wake trampoline to physical page `WAKE_TRAMPOLINE_PHYS` and patches
// its parameter slots.
//
// Steps, in order:
// 1. Map the page at the virtual address equal to its physical address,
//    writable and executable.
// 2. Copy `WAKE_TRAMPOLINE_DATA` byte by byte.
// 3. Overwrite three consecutive u64 slots starting at offset 8 with
//    `stack_rsp`, `cr3` and the address of `resume_from_s3_trampoline`.
// 4. Unmap the page again.
//
// Panics (via `expect`) if mapping or unmapping fails.
// NOTE(review): the copy length is not checked against the page size, and
// `WAKE_TRAMPOLINE_DATA` exists only under `cfg(target_arch = "x86_64")` while
// this function is not cfg-gated — confirm both are fine for the build.
fn install_wake_trampoline(stack_rsp: usize, cr3: usize) {
let trampoline_page = Page::containing_address(VirtualAddress::new(WAKE_TRAMPOLINE_PHYS));
let trampoline_frame = PhysicalAddress::new(WAKE_TRAMPOLINE_PHYS);
// SAFETY: The 0x8000 low-memory trampoline page is reserved by the kernel for bootstrap stubs.
// The second tuple element (the page-table physical address) is computed but discarded.
let (result, _) = unsafe {
let mut mapper = KernelMapper::lock_rw();
let result = mapper
.map_phys(
trampoline_page.start_address(),
trampoline_frame,
PageFlags::new().execute(true).write(true),
)
.expect("failed to map S3 wake trampoline page");
(result, mapper.table().phys().data())
};
result.flush();
for (index, value) in WAKE_TRAMPOLINE_DATA.iter().enumerate() {
// SAFETY: The trampoline page is mapped writable at the same virtual address as the physical page.
unsafe {
core::ptr::write_volatile((WAKE_TRAMPOLINE_PHYS as *mut u8).add(index), *value);
}
}
// SAFETY: The wake trampoline layout reserves three qword fields immediately after the jump.
unsafe {
// Slots live at byte offsets 8, 16 and 24 of the trampoline page.
let stack_slot = (WAKE_TRAMPOLINE_PHYS + 8) as *mut u64;
let page_table_slot = stack_slot.add(1);
let code_slot = stack_slot.add(2);
stack_slot.write(stack_rsp as u64);
page_table_slot.write(cr3 as u64);
#[expect(clippy::fn_to_numeric_cast)]
code_slot.write(resume_from_s3_trampoline as usize as u64);
}
// SAFETY: The trampoline mapping is no longer needed once the physical page has been populated.
let (_frame, _, flush) = unsafe {
KernelMapper::lock_rw()
.unmap_phys(trampoline_page.start_address())
.expect("failed to unmap S3 wake trampoline page")
};
flush.flush();
}
// Stores the current GDTR and IDTR into `context.gdtr` / `context.idtr`
// using the SGDT and SIDT instructions.
fn save_descriptor_tables(context: &mut SavedCpuContext) {
// SAFETY: SGDT/SIDT only read the current CPU descriptor-table registers into the provided storage.
unsafe {
core::arch::asm!("sgdt [{}]", in(reg) &mut context.gdtr, options(nostack, preserves_flags));
core::arch::asm!("sidt [{}]", in(reg) &mut context.idtr, options(nostack, preserves_flags));
}
}
// Saves the x87/SSE state into `context.fpu` with FXSAVE64.
fn save_fpu_state(context: &mut SavedCpuContext) {
// SAFETY: The kernel owns the current CPU at suspend entry and the FXSAVE buffer is 64-byte aligned.
unsafe {
core::arch::asm!(
"fxsave64 [{}]",
in(reg) context.fpu.bytes.as_mut_ptr(),
);
}
}
// Reloads the x87/SSE state from `context.fpu` with FXRSTOR64; counterpart of
// `save_fpu_state`.
fn restore_fpu_state(context: &SavedCpuContext) {
// SAFETY: The saved FXSAVE image belongs to the same CPU context and matches the restore instruction.
unsafe {
core::arch::asm!(
"fxrstor64 [{}]",
in(reg) context.fpu.bytes.as_ptr(),
);
}
}
// Captures the CPU state needed to resume after S3 and returns it.
//
// `entry_rsp` is stored as given. Recorded here: CR0, CR2, CR3, CR4, RFLAGS,
// the current RSP (`runtime_rsp`), the EFER / FS_BASE / GS_BASE /
// KERNEL_GSBASE MSRs, GDTR, IDTR and the FXSAVE image. `facs_address` is left
// at its default (0) for the caller to fill in.
fn save_cpu_context(entry_rsp: usize) -> SavedCpuContext {
let mut context = SavedCpuContext {
entry_rsp,
..SavedCpuContext::default()
};
// SAFETY: Reading control registers and MSRs is required to reconstruct the CPU execution state on wake.
unsafe {
core::arch::asm!(
"mov {}, cr0",
out(reg) context.cr0,
options(nostack, preserves_flags)
);
core::arch::asm!(
"mov {}, cr2",
out(reg) context.cr2,
options(nostack, preserves_flags)
);
core::arch::asm!(
"mov {}, cr3",
out(reg) context.cr3,
options(nostack, preserves_flags)
);
core::arch::asm!(
"mov {}, cr4",
out(reg) context.cr4,
options(nostack, preserves_flags)
);
// RFLAGS can only be read through the stack, hence no `nostack` here.
core::arch::asm!(
"pushfq",
"pop {}",
out(reg) context.rflags,
options(preserves_flags)
);
// This is RSP inside this function, not the caller's `entry_rsp`.
core::arch::asm!("mov {}, rsp", out(reg) context.runtime_rsp, options(nostack, preserves_flags));
context.efer = x86::msr::rdmsr(x86::msr::IA32_EFER);
context.fs_base = x86::msr::rdmsr(x86::msr::IA32_FS_BASE);
context.gs_base = x86::msr::rdmsr(x86::msr::IA32_GS_BASE);
context.kernel_gs_base = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE);
}
save_descriptor_tables(&mut context);
save_fpu_state(&mut context);
context
}
// Writes `vector` to both FACS waking-vector fields: truncated to 32 bits for
// `firmware_waking_vector` and as a full 64-bit value for
// `x_firmware_waking_vector`. Passing 0 clears both.
// NOTE(review): both fields receive the same non-zero value on suspend; the ACPI
// specification gives the two fields different wake-entry semantics — confirm the
// `s3_wakeup` trampoline matches the mode firmware will use.
fn set_firmware_waking_vector(facs: &mut PhysicalMapping<KernelAcpiHandler, Facs>, vector: usize) {
facs.firmware_waking_vector = vector as u32;
facs.x_firmware_waking_vector = vector as u64;
}
/// Programs the PM1 control registers to enter the sleep state described by
/// `sleep_type`.
///
/// Sequence:
/// 1. Write SLP_TYP (with SLP_EN clear) to PM1a and, if present, PM1b.
/// 2. Flush the caches with WBINVD.
/// 3. Write SLP_TYP | SLP_EN to PM1a, then to PM1b.
///
/// The cache flush precedes every SLP_EN write, so whichever register
/// actually triggers the transition, dirty lines have already been written
/// back. Previously SLP_EN was written to PM1b before the flush and before
/// PM1a.
///
/// Any register read or write failure is reported as
/// `SleepError::UnsupportedPmControl`.
fn write_pm1_control_block(
    registers: &FixedRegisters<KernelAcpiHandler>,
    sleep_type: SleepTypeData,
) -> Result<(), SleepError> {
    let pm1 = &registers.pm1_control_registers;

    // Step 1a: arm PM1a with the sleep type, SLP_EN still clear.
    let current_a = pm1
        .pm1a
        .read()
        .map_err(|_| SleepError::UnsupportedPmControl)? as u16;
    let armed_a = (current_a & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.a;
    pm1.pm1a
        .write(u64::from(armed_a))
        .map_err(|_| SleepError::UnsupportedPmControl)?;

    // Step 1b: same for the optional PM1b block; remember it for step 3.
    let armed_pm1b = match &pm1.pm1b {
        Some(pm1b) => {
            let current_b = pm1b.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16;
            let armed_b = (current_b & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.b;
            pm1b.write(u64::from(armed_b))
                .map_err(|_| SleepError::UnsupportedPmControl)?;
            Some((pm1b, armed_b))
        }
        None => None,
    };

    // SAFETY: WBINVD is required here to flush dirty cache lines before firmware powers down the CPU package.
    unsafe {
        core::arch::asm!("wbinvd", options(nostack, preserves_flags));
    }

    // Step 3: assert SLP_EN, PM1a first and PM1b second.
    pm1.pm1a
        .write(u64::from(armed_a | ACPI_SLP_EN))
        .map_err(|_| SleepError::UnsupportedPmControl)?;
    if let Some((pm1b, armed_b)) = armed_pm1b {
        pm1b.write(u64::from(armed_b | ACPI_SLP_EN))
            .map_err(|_| SleepError::UnsupportedPmControl)?;
    }
    Ok(())
}
// Naked entry stub: copies the entry-time RSP into RSI (the second integer
// argument register of the sysv64 convention) and tail-jumps to
// `enter_sleep_raw_inner`, which therefore receives `(state, entry_rsp)`.
// Because this is a jump rather than a call, the return address at `entry_rsp`
// is the one of this function's caller.
#[unsafe(naked)]
unsafe extern "sysv64" fn enter_sleep_raw(state: usize) -> usize {
core::arch::naked_asm!(
"mov rsi, rsp",
"jmp {inner}",
inner = sym enter_sleep_raw_inner,
);
}
// Body of the suspend path. `state` is 3 or 5; `entry_rsp` is the stack pointer
// captured by `enter_sleep_raw`.
//
// Returns an error code from `SleepError::code()` when something fails before
// or while asserting SLP_EN, or `SleepDidNotEnter` if execution continues past
// the final `hlt`. A successful S3 cycle does not return from here; the resume
// path returns 0 directly to the original caller.
//
// The context save, trampoline install and waking-vector write are performed
// for S5 as well as S3.
// NOTE(review): the error return after `interrupt::disable()` leaves interrupts
// disabled, `SAVED_CONTEXT` populated and the FACS waking vector set — confirm
// callers cope with that.
extern "C" fn enter_sleep_raw_inner(state: usize, entry_rsp: usize) -> usize {
let state = match state {
3 => SleepState::S3,
5 => SleepState::S5,
_ => return SleepError::InvalidSleepObject.code(),
};
let (registers, mut facs, interpreter) = match load_interpreter() {
Ok(tuple) => tuple,
Err(error) => return error.code(),
};
let sleep_type = match sleep_type_data_from_interpreter(&interpreter, state) {
Ok(data) => data,
Err(error) => return error.code(),
};
let mut context = save_cpu_context(entry_rsp);
context.facs_address = facs.physical_start;
// The trampoline resumes on the stack pointer sampled inside `save_cpu_context`.
install_wake_trampoline(context.runtime_rsp, context.cr3);
set_firmware_waking_vector(&mut facs, WAKE_TRAMPOLINE_PHYS);
{
// Scoped so the lock is released before interrupts are disabled.
let mut saved = SAVED_CONTEXT.lock();
*saved = Some(context);
}
// SAFETY: Suspend entry must not be interrupted while the wake vector and PM1 control block are being armed.
unsafe {
interrupt::disable();
}
if let Err(error) = write_pm1_control_block(registers.as_ref(), sleep_type) {
return error.code();
}
// SAFETY: The final CLI+HLT sequence is the architectural handoff point after asserting SLP_EN.
unsafe {
core::arch::asm!("cli; hlt", options(nostack));
}
SleepError::SleepDidNotEnter.code()
}
// Rust-side continuation of the wake trampoline; its address is written into
// the trampoline page by `install_wake_trampoline`.
//
// Takes the saved context (panicking if none was stored), clears the FACS
// waking vector, restores EFER, CR3, CR4, CR2, CR0, GDTR, IDTR, the task
// register, the FS/GS base MSRs and the FPU state, then switches to the saved
// entry stack, restores RFLAGS, zeroes EAX and executes `ret`. The original
// caller of `enter_sleep_raw` thus observes a return value of 0.
// NOTE(review): only RSP, RFLAGS and EAX are set before `ret`, and
// `SavedCpuContext` holds no other general-purpose registers, so callee-saved
// registers are not restored here — confirm the trampoline or the caller
// accounts for this.
extern "C" fn resume_from_s3_trampoline() -> ! {
let mut saved = SAVED_CONTEXT.lock();
let context = saved.take().expect("S3 wake trampoline resumed without saved CPU context");
drop(saved);
// SAFETY: The saved FACS physical address was captured from the validated FADT during suspend entry.
if context.facs_address != 0 {
let mut facs = unsafe {
KernelAcpiHandler.map_physical_region::<Facs>(
context.facs_address,
core::mem::size_of::<Facs>(),
)
};
// Clear the vector so a later wake does not re-enter the stale trampoline.
set_firmware_waking_vector(&mut facs, 0);
}
// SAFETY: The wake trampoline already switched to the saved kernel CR3 and long mode, so the remaining restores are architectural register state only.
unsafe {
x86::msr::wrmsr(x86::msr::IA32_EFER, context.efer);
core::arch::asm!("mov cr3, {}", in(reg) context.cr3, options(nostack));
core::arch::asm!("mov cr4, {}", in(reg) context.cr4, options(nostack));
core::arch::asm!("mov cr2, {}", in(reg) context.cr2, options(nostack));
core::arch::asm!("mov cr0, {}", in(reg) context.cr0, options(nostack));
core::arch::asm!("lgdt [{}]", in(reg) &context.gdtr, options(nostack));
core::arch::asm!("lidt [{}]", in(reg) &context.idtr, options(nostack));
task::load_tr(SegmentSelector::new(crate::arch::gdt::GDT_TSS as u16, Ring::Ring0));
x86::msr::wrmsr(x86::msr::IA32_FS_BASE, context.fs_base);
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, context.gs_base);
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, context.kernel_gs_base);
}
restore_fpu_state(&context);
// SAFETY: Returning with the original entry stack and RFLAGS completes the suspend call as a successful function return.
unsafe {
core::arch::asm!(
"mov rsp, {entry_rsp}",
"push {rflags}",
"popfq",
"xor eax, eax",
"ret",
entry_rsp = in(reg) context.entry_rsp,
rflags = in(reg) context.rflags,
options(noreturn)
);
}
}
/// Requests a transition into the given ACPI sleep state.
///
/// On architectures other than x86_64 this always fails with `SleepError::UnsupportedArch`.
/// On x86_64 the state is translated to its numeric code and handed to `enter_sleep_raw`;
/// a return value equal to `SLEEP_RETURN_OK` maps to `Ok(())`, anything else is decoded with
/// `SleepError::from_code`.
pub fn enter_sleep_state(state: SleepState) -> core::result::Result<(), SleepError> {
    #[cfg(not(target_arch = "x86_64"))]
    {
        let _ = state;
        return Err(SleepError::UnsupportedArch);
    }
    #[cfg(target_arch = "x86_64")]
    {
        let state_code = match state {
            SleepState::S3 => 3,
            SleepState::S5 => 5,
        };
        // SAFETY: only the codes 3 and 5 are ever passed, which are the two values the raw
        // entry path maps back to a `SleepState`.
        let status = unsafe { enter_sleep_raw(state_code) };
        if status != SLEEP_RETURN_OK {
            Err(SleepError::from_code(status))
        } else {
            Ok(())
        }
    }
}
/// Returns the newline-separated list of sleep states that can be requested.
///
/// S5 is always listed; S3 is listed first when `sleep_type_data(SleepState::S3)` succeeds.
pub fn available_sleep_states() -> &'static [u8] {
    match sleep_type_data(SleepState::S3) {
        Ok(_) => b"S3\nS5\n",
        Err(_) => b"S5\n",
    }
}
pub fn trigger_sleep_request(request: &str) -> Result<(), Error> {
match request.trim() {
"S3" => enter_sleep_state(SleepState::S3).map_err(|_| Error::new(EIO)),
"S5" => enter_sleep_state(SleepState::S5).map_err(|_| Error::new(EIO)),
_ => Err(Error::new(EINVAL)),
}
}
@@ -82,6 +82,15 @@ extern "C" fn kstart() {
/// The entry to Rust, all things must be initialized
unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
unsafe {
// EARLY CANARY: write 'R' to COM1 before any kernel init.
// This proves the serial hardware works and the kernel reached Rust entry.
// If this character appears but "RedBear OS starting..." does not,
// the hang is in args_ptr.read(), serial::init(), or graphical_debug::init().
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'R', options(nostack, preserves_flags));
}
let bootstrap = {
let args = args_ptr.read();
@@ -91,27 +100,49 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
// Set up graphical debug
graphical_debug::init(args.env());
info!("Redox OS starting...");
// SECOND CANARY: write 'S' to COM1 after serial init.
// If 'R' appears but 'S' does not, the hang is in serial::init() or graphical_debug::init().
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'S', options(nostack, preserves_flags));
}
info!("RedBear OS starting...");
args.print();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'1', options(nostack, preserves_flags)); }
// Set up GDT
gdt::init_bsp(stack_end);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'2', options(nostack, preserves_flags)); }
// Set up IDT
idt::init_bsp();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'3', options(nostack, preserves_flags)); }
// Initialize RMM
#[cfg(target_arch = "x86")]
crate::startup::memory::init(&args, Some(0x100000), Some(0x40000000));
#[cfg(target_arch = "x86_64")]
crate::startup::memory::init(&args, Some(0x100000), None);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'4', options(nostack, preserves_flags)); }
// Initialize paging
paging::init();
#[cfg(target_arch = "x86_64")]
crate::arch::alternative::early_init(true);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'5', options(nostack, preserves_flags)); }
// Set up syscall instruction
interrupt::syscall::init();
@@ -121,6 +152,9 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
// Activate memory logging
crate::log::init();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'6', options(nostack, preserves_flags)); }
// Initialize miscellaneous processor features
#[cfg(target_arch = "x86_64")]
crate::arch::misc::init(LogicalCpuId::BSP);
@@ -128,6 +162,9 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
// Initialize devices
device::init();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'7', options(nostack, preserves_flags)); }
// Read ACPI tables, starts APs
if cfg!(feature = "acpi") {
crate::acpi::init(args.acpi_rsdp());
@@ -0,0 +1,110 @@
; ACPI S3 wake trampoline
; compiled with nasm by build.rs, copied to physical 0x8000 before S3 entry
;
; Entered in 16-bit real mode. Loads the page table root from the parameter
; block, enables PAE/long mode/paging in one step, far-jumps into 64-bit code,
; then loads the stack pointer from the parameter block and jumps to the
; address stored in `.code`.
ORG 0x8000
SECTION .text
USE16

trampoline:
    jmp short startup_wake
    ; Pad so the parameter block starts at a fixed offset of 8 bytes.
    times 8 - ($ - trampoline) nop
    ; Parameter block; all three are zero in the image and must be filled in
    ; before the trampoline runs (presumably by the kernel's
    ; install_wake_trampoline - confirm).
    .stack: dq 0        ; RSP to load once in long mode
    .page_table: dq 0   ; page table root; only the low 32 bits are loaded below
    .code: dq 0         ; 64-bit address to jump to once in long mode

startup_wake:
    cli
    ; Flat zero-based real-mode segments so the absolute ORG 0x8000 addresses work.
    xor ax, ax
    mov ds, ax
    mov es, ax
    mov ss, ax
    mov sp, 0

    ; Load the page table root. This is a 32-bit load, so the root table must
    ; sit below 4 GiB for the trampoline to work.
    mov edi, [trampoline.page_table]
    mov cr3, edi

    ; CR0: clear EM (bit 2) and TS (bit 3), set MP (bit 1) and NE (bit 5).
    mov eax, cr0
    and al, 11110011b
    or al, 00100010b
    mov cr0, eax

    ; CR4: OSFXSR (bit 9), PGE (bit 7), PAE (bit 5), PSE (bit 4).
    mov eax, cr4
    or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4
    mov cr4, eax

    fninit

    lgdt [gdtr]

    ; IA32_EFER (MSR 0xC0000080): set NXE (bit 11) and LME (bit 8).
    mov ecx, 0xC0000080
    rdmsr
    or eax, 1 << 11 | 1 << 8
    wrmsr

    ; CR0: PG (bit 31), WP (bit 16), PE (bit 0) - activates long mode.
    mov ebx, cr0
    or ebx, 1 << 31 | 1 << 16 | 1
    mov cr0, ebx

    ; Far jump reloads CS with the 64-bit code descriptor.
    jmp gdt.kernel_code:long_mode_wake

USE64
long_mode_wake:
    ; Load every data segment register with the flat data descriptor.
    mov rax, gdt.kernel_data
    mov ds, rax
    mov es, rax
    mov fs, rax
    mov gs, rax
    mov ss, rax

    ; Switch to the stack from the parameter block and enter the 64-bit handler.
    mov rsp, [trampoline.stack]
    mov rax, [trampoline.code]
    jmp rax

; Layout of one 8-byte segment descriptor.
struc GDTEntry
    .limitl resw 1
    .basel resw 1
    .basem resb 1
    .attribute resb 1
    .flags__limith resb 1
    .baseh resb 1
endstruc

; Access-byte bits used below.
attrib:
    .present equ 1 << 7
    .user equ 1 << 4        ; descriptor-type (S) bit: code/data rather than system segment
    .code equ 1 << 3
    .writable equ 1 << 1

; Flags-nibble bits used below.
flags:
    .long_mode equ 1 << 5

; Pseudo-descriptor consumed by LGDT: 16-bit limit followed by the base.
; The limit is the offset of the LAST valid byte, i.e. table size minus one.
gdtr:
    dw gdt.end - 1
    dq gdt

gdt:
.null equ $ - gdt
    dq 0

; 64-bit code segment.
.kernel_code equ $ - gdt
    istruc GDTEntry
        at GDTEntry.limitl, dw 0
        at GDTEntry.basel, dw 0
        at GDTEntry.basem, db 0
        at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code
        at GDTEntry.flags__limith, db flags.long_mode
        at GDTEntry.baseh, db 0
    iend

; Writable data segment.
.kernel_data equ $ - gdt
    istruc GDTEntry
        at GDTEntry.limitl, dw 0
        at GDTEntry.basel, dw 0
        at GDTEntry.basem, db 0
        at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable
        at GDTEntry.flags__limith, db 0
        at GDTEntry.baseh, db 0
    iend

.end equ $ - gdt
@@ -4,16 +4,10 @@ use crate::{
percpu::PercpuBlock,
syscall::FloatRegisters,
};
use core::{mem::offset_of, ptr, sync::atomic::AtomicBool};
use core::{mem::offset_of, ptr};
use spin::Once;
use syscall::{EnvRegisters, Result};
/// This must be used by the kernel to ensure that context switches are done atomically
/// Compare and exchange this to true when beginning a context switch on any CPU
/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch
/// This must be done, as no locks can be held on the stack during switch
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
// 512 bytes for registers, extra bytes for fpcr and fpsr
pub const KFX_ALIGN: usize = 16;
@@ -2,13 +2,11 @@ use crate::{
arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA, percpu::PercpuBlock,
syscall::FloatRegisters,
};
use core::{mem::offset_of, sync::atomic::AtomicBool};
use core::mem::offset_of;
use rmm::{Arch, VirtualAddress};
use spin::Once;
use syscall::{error::*, EnvRegisters};
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
pub const KFX_ALIGN: usize = 16;
#[derive(Clone, Debug, Default)]
@@ -1,4 +1,4 @@
use core::{mem::offset_of, sync::atomic::AtomicBool};
use core::mem::offset_of;
use rmm::{Arch, VirtualAddress};
use spin::Once;
use syscall::{error::*, EnvRegisters};
@@ -14,12 +14,6 @@ use crate::{
syscall::FloatRegisters,
};
/// This must be used by the kernel to ensure that context switches are done atomically
/// Compare and exchange this to true when beginning a context switch on any CPU
/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch
/// This must be done, as no locks can be held on the stack during switch
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;
pub const KFX_ALIGN: usize = 16;
@@ -1,6 +1,5 @@
use core::{
ptr::{addr_of, addr_of_mut},
sync::atomic::AtomicBool,
};
use crate::syscall::FloatRegisters;
@@ -12,12 +11,6 @@ use spin::Once;
use syscall::{error::*, EnvRegisters};
use x86::msr;
/// This must be used by the kernel to ensure that context switches are done atomically
/// Compare and exchange this to true when beginning a context switch on any CPU
/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch
/// This must be done, as no locks can be held on the stack during switch
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;
#[cfg(cpu_feature_never = "xsave")]
@@ -148,6 +148,8 @@ pub struct Context {
pub euid: u32,
pub egid: u32,
pub pid: usize,
/// Supplementary group IDs for access control decisions.
pub groups: Vec<u32>,
// See [`PreemptGuard`]
//
@@ -204,6 +206,7 @@ impl Context {
euid: 0,
egid: 0,
pid: 0,
groups: Vec::new(),
#[cfg(feature = "syscall_debug")]
syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(),
@@ -479,6 +482,7 @@ impl Context {
uid: self.euid,
gid: self.egid,
pid: self.pid,
groups: self.groups.clone(),
}
}
}
+52 -5
View File
@@ -4,7 +4,7 @@ use crate::{
event,
scheme::{self, SchemeId},
sync::{CleanLockToken, RwLock, L6},
syscall::error::Result,
syscall::error::{Error, Result, ESTALE},
};
use alloc::sync::Arc;
use syscall::{schemev2::NewFdFlags, RwFlags, O_APPEND, O_NONBLOCK};
@@ -18,6 +18,7 @@ pub struct FileDescription {
pub offset: u64,
/// The scheme that this file refers to
pub scheme: SchemeId,
pub scheme_generation: Option<u64>,
/// The number the scheme uses to refer to this file
pub number: usize,
/// The flags passed to open or fcntl(SETFL)
@@ -32,6 +33,52 @@ bitflags! {
}
}
impl FileDescription {
/// Builds a description from its parts, taking the scheme generation verbatim.
///
/// `scheme_generation` is stored as given; `None` means `get_scheme` performs no staleness
/// check for this description.
pub fn with_generation(
    scheme: SchemeId,
    scheme_generation: Option<u64>,
    number: usize,
    offset: u64,
    flags: u32,
    internal_flags: InternalFlags,
) -> Self {
    Self {
        scheme,
        scheme_generation,
        number,
        offset,
        flags,
        internal_flags,
    }
}
/// Builds a description bound to the scheme's current generation.
///
/// The generation is read through `scheme::current_scheme_generation` at construction time
/// and recorded, so that `get_scheme` can later detect that it has changed.
pub fn new(
    scheme: SchemeId,
    number: usize,
    offset: u64,
    flags: u32,
    internal_flags: InternalFlags,
    token: &mut CleanLockToken,
) -> Self {
    let generation_now = scheme::current_scheme_generation(token.token(), scheme);
    Self::with_generation(
        scheme,
        Some(generation_now),
        number,
        offset,
        flags,
        internal_flags,
    )
}
/// Looks up the scheme this description refers to, rejecting stale descriptions.
///
/// When a generation was recorded and it no longer matches the value reported by
/// `scheme::current_scheme_generation`, the lookup fails with `ESTALE`. Descriptions without
/// a recorded generation skip that check.
pub fn get_scheme(&self, token: &mut CleanLockToken) -> Result<scheme::KernelSchemes> {
    if let Some(recorded) = self.scheme_generation {
        let live = scheme::current_scheme_generation(token.token(), self.scheme);
        if recorded != live {
            return Err(Error::new(ESTALE));
        }
    }
    scheme::get_scheme(token.token(), self.scheme)
}
pub fn rw_flags(&self, rw: RwFlags) -> u32 {
let mut ret = self.flags & !(O_NONBLOCK | O_APPEND) as u32;
if rw.contains(RwFlags::APPEND) {
@@ -76,7 +123,7 @@ impl FileDescription {
pub fn try_close(self, token: &mut CleanLockToken) -> Result<()> {
event::unregister_file(self.scheme, self.number, token);
let scheme = scheme::get_scheme(token.token(), self.scheme)?;
let scheme = self.get_scheme(token)?;
scheme.close(self.number, token)
}
@@ -85,12 +132,12 @@ impl FileDescription {
impl FileDescriptor {
pub fn close(self, token: &mut CleanLockToken) -> Result<()> {
{
let (scheme_id, number, internal_flags) = {
let (desc, number, internal_flags) = {
let desc = self.description.read(token.token());
(desc.scheme, desc.number, desc.internal_flags)
(*desc, desc.number, desc.internal_flags)
};
if internal_flags.contains(InternalFlags::NOTIFY_ON_NEXT_DETACH) {
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
let scheme = desc.get_scheme(token)?;
scheme.detach(number, token)?;
}
}
@@ -64,14 +64,13 @@ impl UnmapResult {
return Ok(());
};
let (scheme_id, number) = {
let desc = description.write(token.token());
(desc.scheme, desc.number)
let (scheme, number) = {
let desc = *description.read(token.token());
(desc.get_scheme(token)?, desc.number)
};
let scheme_opt = scheme::get_scheme(token.token(), scheme_id);
let funmap_result = scheme_opt
.and_then(|scheme| scheme.kfunmap(number, base_offset, self.size, self.flags, token));
let funmap_result = scheme
.kfunmap(number, base_offset, self.size, self.flags, token);
if let Ok(fd) = Arc::try_unwrap(description) {
fd.into_inner().try_close(token)?;
@@ -2687,20 +2686,13 @@ fn correct_inner<'l>(
// XXX: This is cheating, but guaranteed we won't deadlock because we've dropped addr_space_guard
let mut token = unsafe { CleanLockToken::new() };
let (scheme_id, scheme_number) = {
let desc = &file_ref.description.read(token.token());
(desc.scheme, desc.number)
let desc = *file_ref.description.read(token.token());
let scheme = desc.get_scheme(&mut token).map_err(|_| PfError::Segv)?;
let scheme_number = desc.number;
let user_inner = match scheme {
KernelSchemes::User(user) => user.inner,
_ => return Err(PfError::Segv),
};
let user_inner = scheme::get_scheme(token.token(), scheme_id)
.ok()
.and_then(|s| {
if let KernelSchemes::User(user) = s {
Some(user.inner)
} else {
None
}
})
.ok_or(PfError::Segv)?;
let offset = file_ref.base_offset as u64 + (pages_from_grant_start * PAGE_SIZE) as u64;
user_inner
@@ -14,8 +14,8 @@ use crate::{
memory::{RmmA, RmmArch, TableKind},
percpu::PercpuBlock,
sync::{
ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
RwLockWriteGuard, L0, L1, L2, L4,
ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex,
MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4,
},
syscall::error::Result,
};
@@ -74,10 +74,12 @@ pub use self::arch::empty_cr3;
// the context file descriptors.
static CONTEXTS: RwLock<L2, BTreeSet<ContextRef>> = RwLock::new(BTreeSet::new());
// Actual context store for the scheduler
static RUN_CONTEXTS: Mutex<L1, RunContextData> = Mutex::new(RunContextData::new());
// Actual context store for the scheduler — uses MCS fair spinlock to
// eliminate cache-line bouncing under multi-CPU contention.
static RUN_CONTEXTS: McsMutex<L1, RunContextData> = McsMutex::new(RunContextData::new());
// Context that has been pushed out from RUN_CONTEXTS after being idle
// Context that has been pushed out from RUN_CONTEXTS after being idle.
// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock).
static IDLE_CONTEXTS: Mutex<L2, VecDeque<WeakContextRef>> = Mutex::new(VecDeque::new());
pub struct RunContextData {
@@ -113,7 +115,7 @@ pub fn idle_contexts_try(
IDLE_CONTEXTS.try_lock(token)
}
pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> {
pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> {
RUN_CONTEXTS.lock(token)
}
@@ -15,7 +15,7 @@ use crate::{
use alloc::{sync::Arc, vec::Vec};
use core::{
cell::{Cell, RefCell},
hint, mem,
mem,
sync::atomic::Ordering,
};
use syscall::PtraceFlags;
@@ -26,6 +26,11 @@ enum UpdateResult {
Blocked,
}
/// Default number of PIT ticks before triggering a context switch.
/// At ~2.25 ms per tick, 3 ticks ≈ 6.75 ms timeslice.
/// Configurable per-CPU via `ContextSwitchPercpu::preempt_interval`.
const DEFAULT_PREEMPT_INTERVAL: usize = 3;
// A simple geometric series where value[i] ~= value[i - 1] * 1.25
const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904,
@@ -90,13 +95,15 @@ struct SwitchResultInner {
///
/// The function also calls the signal handler after switching contexts.
pub fn tick(token: &mut CleanLockToken) {
let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks;
let percpu = PercpuBlock::current();
let ticks_cell = &percpu.switch_internals.pit_ticks;
let new_ticks = ticks_cell.get() + 1;
ticks_cell.set(new_ticks);
// Trigger a context switch after every 3 ticks (approx. 6.75 ms).
if new_ticks >= 3 {
// Trigger a context switch when the per-CPU preempt interval is reached.
let interval = percpu.switch_internals.preempt_interval.get();
if new_ticks >= interval {
switch(token);
crate::context::signal::signal_handler(token);
}
@@ -120,7 +127,10 @@ pub unsafe extern "C" fn switch_finish_hook() {
crate::arch::stop::emergency_reset();
}
}
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
PercpuBlock::current()
.switch_internals
.in_context_switch
.set(false);
crate::percpu::switch_arch_hook();
}
}
@@ -150,16 +160,15 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
//set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
percpu.switch_internals.pit_ticks.set(0);
// Acquire the global lock to ensure exclusive access during context switch and avoid
// issues that would be caused by the unsafe operations below
// TODO: Better memory orderings?
while arch::CONTEXT_SWITCH_LOCK
.compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed)
.is_err()
{
hint::spin_loop();
percpu.maybe_handle_tlb_shootdown();
}
// Acquire the per-CPU context switch flag. Each CPU can only be in one context
// switch at a time. The per-context write locks provide cross-CPU safety; this
// flag catches re-entrant switches on the same CPU (a kernel bug).
debug_assert!(
!percpu.switch_internals.in_context_switch.get(),
"context switch re-entry on CPU {}",
percpu.cpu_id
);
percpu.switch_internals.in_context_switch.set(true);
// Lock the previous context.
let prev_context_lock = crate::context::current();
@@ -167,8 +176,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };
if !prev_context_guard.is_preemptable() {
// Unset global lock
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
// Unset per-CPU context switch flag
percpu.switch_internals.in_context_switch.set(false);
// Pretend to have finished switching, so CPU is not idled
return SwitchResult::Switched;
@@ -292,8 +301,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
SwitchResult::Switched
}
_ => {
// No target was found, unset global lock and return
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
// No target was found, unset per-CPU context switch flag and return
percpu.switch_internals.in_context_switch.set(false);
percpu.stats.set_state(cpu_stats::CpuState::Idle);
@@ -352,6 +361,7 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize,
}
/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
/// with NUMA-aware context selection preference.
fn select_next_context(
token: &mut CleanLockToken,
percpu: &PercpuBlock,
@@ -377,6 +387,10 @@ fn select_next_context(
let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
let mut skipped_contexts = 0;
// NUMA-aware selection: remember cross-node fallback candidate.
let my_numa_node = percpu.numa_node.get();
let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None;
'priority: loop {
i = (i + 1) % 40;
total_iters += 1;
@@ -441,9 +455,44 @@ fn select_next_context(
// Is this context runnable on this CPU?
let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
if let UpdateResult::CanSwitch = sw {
next_context_guard_opt = Some(next_context_guard);
balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
break 'priority;
// NUMA-aware selection: check if this context's last CPU was on the same node.
let same_node = if my_numa_node != u8::MAX {
next_context_guard.cpu_id
.map(|cid| {
crate::percpu::get_for_cpu(cid)
.map(|p| p.numa_node.get() == my_numa_node)
.unwrap_or(false)
})
.unwrap_or(true) // New context (no last CPU) — treat as same node
} else {
true // No NUMA info — treat all as same node
};
if same_node {
// Cache-warm: select immediately
percpu.current_prio.set(next_context_guard.prio);
next_context_guard_opt = Some(next_context_guard);
balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
break 'priority;
} else {
// Cross-node candidate: save as fallback, keep scanning for same-node
if cross_node_fallback.is_none() {
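// Remember the priority and keep the write guard for later; the balance is charged right away.
// NOTE(review): `next_context_ref` is not pushed back on this path - if a same-node context is
// selected afterwards, this fallback appears to be dropped without being re-queued. Confirm.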
cross_node_fallback =
Some((next_context_guard.prio, next_context_guard));
balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
// Don't break — keep looking for a same-node context
continue;
} else {
// Already have a cross-node fallback; push this one back
contexts.push_back(next_context_ref);
skipped_contexts += 1;
if skipped_contexts >= total_contexts {
break 'priority;
}
continue;
}
}
} else {
if matches!(sw, UpdateResult::Blocked) {
idle_contexts(token.token()).push_back(next_context_ref);
@@ -458,6 +507,15 @@ fn select_next_context(
}
}
}
// If we found a cross-node fallback but no same-node context, use it
if next_context_guard_opt.is_none() {
if let Some((prio, guard)) = cross_node_fallback {
percpu.current_prio.set(prio);
next_context_guard_opt = Some(guard);
}
}
percpu.balance.set(balance);
percpu.last_queue.set(i);
@@ -465,7 +523,10 @@ fn select_next_context(
// Send the old process to the back of the line (if it is still runnable)
let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock));
if prev_context_guard.status.is_runnable() {
let prio = prev_context_guard.prio;
let raw_prio = prev_context_guard.prio;
let prio = percpu.effective_prio(raw_prio);
// Clear PI donation — previous context is being re-queued
percpu.pi_donated_prio.store(u32::MAX, Ordering::Relaxed);
contexts_list[prio].push_back(prev_ctx);
} else {
idle_contexts(token.token()).push_back(prev_ctx);
@@ -477,7 +538,8 @@ fn select_next_context(
return Ok(Some(next_context_guard));
} else {
if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
// We switch into the idle context
// Switching to idle context — cache lowest priority
percpu.current_prio.set(39);
Ok(Some(unsafe { idle_context.write_arc() }))
} else {
// We found no other process to run.
@@ -494,6 +556,13 @@ pub struct ContextSwitchPercpu {
switch_result: Cell<Option<SwitchResultInner>>,
switch_time: Cell<u128>,
pit_ticks: Cell<usize>,
/// Per-CPU context switch flag. Set to true during a context switch on this CPU.
/// Replaced the global CONTEXT_SWITCH_LOCK to eliminate cross-CPU serialization.
in_context_switch: Cell<bool>,
/// Number of PIT ticks before triggering a context switch.
/// Default: 3 (≈6.75 ms). Lower values improve interactive responsiveness;
/// higher values improve throughput for batch/compute workloads.
preempt_interval: Cell<usize>,
current_ctxt: RefCell<Option<Arc<ContextLock>>>,
@@ -508,6 +577,8 @@ impl ContextSwitchPercpu {
switch_result: Cell::new(None),
switch_time: Cell::new(0),
pit_ticks: Cell::new(0),
in_context_switch: Cell::new(false),
preempt_interval: Cell::new(DEFAULT_PREEMPT_INTERVAL),
current_ctxt: RefCell::new(None),
idle_ctxt: RefCell::new(None),
being_sigkilled: Cell::new(false),
+4 -3
View File
@@ -42,17 +42,18 @@ impl core::fmt::Display for LogicalCpuId {
}
#[cfg(target_pointer_width = "64")]
pub const MAX_CPU_COUNT: u32 = 128;
pub const MAX_CPU_COUNT: u32 = 256;
#[cfg(target_pointer_width = "32")]
pub const MAX_CPU_COUNT: u32 = 32;
const SET_WORDS: usize = (MAX_CPU_COUNT / usize::BITS) as usize;
// TODO: Support more than 128 CPUs.
// TODO: Support more than 256 CPUs.
// The maximum number of CPUs on Linux is configurable, and the type for LogicalCpuSet and
// LogicalCpuId may be optimized accordingly. In that case, box the mask if it's larger than some
// base size (probably 256 bytes).
// base size (probably 256 bytes). AMD EPYC has 128C/256T, Threadripper PRO 96C/192T —
// 256 covers current hardware.
#[derive(Debug)]
pub struct LogicalCpuSet([AtomicUsize; SET_WORDS]);
+11 -1
View File
@@ -1,5 +1,5 @@
use alloc::sync::Arc;
use core::sync::atomic::{AtomicUsize, Ordering};
use core::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use hashbrown::{hash_map::DefaultHashBuilder, HashMap};
use smallvec::SmallVec;
use syscall::data::GlobalSchemes;
@@ -23,6 +23,7 @@ int_like!(EventQueueId, AtomicEventQueueId, usize, AtomicUsize);
pub struct EventQueue {
id: EventQueueId,
queue: WaitQueue<Event>,
pub eventfd: Option<(AtomicU64, bool)>, // (counter, semaphore_mode)
}
impl EventQueue {
@@ -30,6 +31,15 @@ impl EventQueue {
EventQueue {
id,
queue: WaitQueue::new(),
eventfd: None,
}
}
/// Creates an event queue that additionally carries eventfd state.
///
/// The `eventfd` field is populated with a counter starting at `initval` together with the
/// `semaphore` mode flag.
pub fn new_eventfd(id: EventQueueId, initval: u64, semaphore: bool) -> EventQueue {
    let counter = AtomicU64::new(initval);
    EventQueue {
        id,
        queue: WaitQueue::new(),
        eventfd: Some((counter, semaphore)),
    }
}
+3
View File
@@ -70,6 +70,9 @@ mod log;
/// Memory management
mod memory;
/// NUMA topology
mod numa;
/// Panic
mod panic;
+81
View File
@@ -0,0 +1,81 @@
//! NUMA topology hints for the kernel scheduler.
//!
//! NUMA discovery (SRAT/SLIT parsing) is performed during kernel ACPI init
//! (`acpi::init()`). The kernel stores a lightweight copy for O(1) scheduling
//! lookups. If no SRAT is found, `init_default()` creates a single-node topology.
use crate::acpi::srat;
use crate::cpu_set::{LogicalCpuId, LogicalCpuSet};
use core::sync::atomic::{AtomicBool, Ordering};
/// Number of node slots in `NumaTopology::nodes`. `init_from_srat` ignores any node whose
/// index is not below this bound.
const MAX_NUMA_NODES: usize = 8;
/// Description of a single NUMA node: its identifier and the logical CPUs assigned to it.
#[derive(Debug)]
pub struct NumaHint {
/// Node identifier; `init_from_srat` also uses it as the index into `NumaTopology::nodes`.
pub node_id: u8,
/// Logical CPUs that belong to this node.
pub cpus: LogicalCpuSet,
}
/// Fixed-capacity table of NUMA nodes plus a one-shot initialisation flag.
pub struct NumaTopology {
/// One optional entry per node slot; `None` means no CPU has been assigned to that node.
pub nodes: [Option<NumaHint>; MAX_NUMA_NODES],
/// Swapped to `true` by the first init function to run, before it fills in `nodes`;
/// later init calls see `true` and return without touching the table.
pub initialized: AtomicBool,
}
impl NumaTopology {
    /// Creates an empty, uninitialised topology (usable in `static` initialisers).
    pub const fn new() -> Self {
        // A named constant is used as the array repeat operand.
        const EMPTY_SLOT: Option<NumaHint> = None;
        Self {
            nodes: [EMPTY_SLOT; MAX_NUMA_NODES],
            initialized: AtomicBool::new(false),
        }
    }

    /// Returns the id of the first populated node whose CPU set contains `cpu`, or `None`
    /// when no node lists it.
    pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> {
        self.nodes
            .iter()
            .flatten()
            .find(|hint| hint.cpus.contains(cpu))
            .map(|hint| hint.node_id)
    }

    /// Reports whether both CPUs resolve to the same node.
    ///
    /// Two CPUs that are both absent from every node also compare as equal, since both
    /// lookups yield `None`.
    pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool {
        let first = self.node_for_cpu(cpu1);
        let second = self.node_for_cpu(cpu2);
        first == second
    }
}
/// Global NUMA table. Written by the init functions in this module and read through
/// `topology()`.
static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new();
/// Returns a shared reference to the global NUMA table.
///
/// The reference is produced from a raw pointer (`addr_of!`) instead of borrowing the
/// `static mut` directly, which keeps this in line with the `addr_of_mut!` accesses used by
/// the init functions and avoids the `static_mut_refs` lint.
pub fn topology() -> &'static NumaTopology {
    // SAFETY: the static lives for the whole program, so the pointer is always valid.
    // NOTE(review): readers are not synchronised against the init functions' writes;
    // presumably initialisation completes before concurrent readers exist - confirm.
    unsafe { &*core::ptr::addr_of!(NUMA_TOPOLOGY) }
}
/// Populates the NUMA table from SRAT data, once.
///
/// `apic_ids` pairs each APIC id with its logical CPU id. A second or later call (to this
/// function or to `init_default`) is a no-op. When `srat::is_available()` is false, the
/// single-node default is installed instead. CPUs whose APIC id has no SRAT node, or whose
/// node index is `MAX_NUMA_NODES` or above, are left out; if that leaves every slot empty,
/// node 0 is created covering all CPUs.
pub fn init_from_srat(apic_ids: &[(u32, LogicalCpuId)]) {
    if topology().initialized.swap(true, Ordering::AcqRel) {
        return;
    }
    if !srat::is_available() {
        init_default_inner();
        return;
    }
    unsafe {
        let table = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
        for &(apic_id, cpu_id) in apic_ids {
            let Some(node) = srat::numa_node_for_apic(apic_id) else {
                continue;
            };
            let slot = node as usize;
            if slot >= MAX_NUMA_NODES {
                continue;
            }
            // Create the node entry on first use, then record the CPU in it.
            let hint = table.nodes[slot].get_or_insert_with(|| NumaHint {
                node_id: node,
                cpus: LogicalCpuSet::empty(),
            });
            hint.cpus.atomic_set(cpu_id);
        }
        // Nothing could be placed: fall back to one node that contains every CPU.
        if table.nodes.iter().all(Option::is_none) {
            table.nodes[0] = Some(NumaHint {
                node_id: 0,
                cpus: LogicalCpuSet::all(),
            });
        }
    }
    let node_count = topology().nodes.iter().flatten().count();
    debug!("NUMA: {node_count} node(s) from SRAT");
}
/// Installs the single-node fallback topology, unless the table was already initialised.
pub fn init_default() {
    let already_initialized = topology().initialized.swap(true, Ordering::AcqRel);
    if !already_initialized {
        init_default_inner();
    }
}
/// Writes node 0, covering all CPUs, into the global table and logs the fact.
///
/// Does not touch the `initialized` flag; both callers in this module swap it first.
fn init_default_inner() {
    unsafe {
        let table = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
        let everything = NumaHint {
            node_id: 0,
            cpus: LogicalCpuSet::all(),
        };
        table.nodes[0] = Some(everything);
    }
    debug!("NUMA: single-node topology (no SRAT)");
}
+184 -7
View File
@@ -4,9 +4,14 @@ use alloc::{
};
use core::{
cell::{Cell, RefCell},
sync::atomic::{AtomicBool, AtomicPtr, Ordering},
hint,
sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
};
/// Maximum number of pages to flush individually using INVLPG before falling
/// back to a full TLB flush (CR3 reload).
const TLB_RANGE_THRESHOLD: u32 = 32;
use rmm::Arch;
use syscall::PtraceFlags;
@@ -16,7 +21,7 @@ use crate::{
cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
cpu_stats::{CpuStats, CpuStatsData},
ptrace::Session,
sync::CleanLockToken,
sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken},
syscall::debug::SyscallDebugInfo,
};
@@ -34,6 +39,38 @@ pub struct PercpuBlock {
pub balance: Cell<[usize; 40]>,
pub last_queue: Cell<usize>,
/// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
pub mcs_sched_node: McsNode,
/// Counts how many times the scheduler MCS lock acquisition was contended.
pub mcs_contention_count: Cell<u64>,
/// TLB shootdown range: start virtual address (page-aligned).
/// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
pub tlb_flush_start: AtomicU64,
/// TLB shootdown range: number of pages to invalidate.
pub tlb_flush_count: AtomicU32,
/// Priority inheritance donation. When another CPU is blocked waiting on a
/// lock this CPU holds, the blocked CPU may donate its priority here.
/// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
pub pi_donated_prio: AtomicU32,
/// Cached priority of the currently-running context on this CPU.
/// Set by the scheduler when selecting a new context. Read by the MCS
/// lock during priority donation — avoids acquiring the context RwLock
/// from the spin loop. Default 39 (lowest priority).
pub current_prio: Cell<usize>,
/// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
/// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
pub numa_node: Cell<u8>,
/// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI).
/// `null` when not waiting on any lock. Set in McsRawLock::acquire() before
/// entering the spin loop, cleared upon acquisition.
pub waiting_on_lock: AtomicPtr<McsRawLock>,
// TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
// first to avoid cache invalidation.
pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -57,6 +94,15 @@ pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) {
ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release)
}
/// Get a reference to another CPU's PercpuBlock by logical CPU ID.
///
/// Returns `None` when no block has been published for `id`, and also when `id` lies outside
/// the `ALL_PERCPU_BLOCKS` table (instead of panicking on the out-of-range index).
pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
    let slot = ALL_PERCPU_BLOCKS.get(id.get() as usize)?;
    // SAFETY: a non-null pointer in the table was published by `init_tlb_shootdown` with
    // Release ordering, paired with the Acquire load here.
    // NOTE(review): assumes a published block is never freed or moved - confirm.
    unsafe { slot.load(Ordering::Acquire).as_ref() }
}
pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
let mut res = ALL_PERCPU_BLOCKS
.iter()
@@ -101,25 +147,148 @@ pub fn shootdown_tlb_ipi(target: Option<LogicalCpuId>) {
core::hint::spin_loop();
}
}
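// Full flush — clear the range info before sending the IPI.
// NOTE(review): these stores are issued after `wants_tlb_shootdown` has already been set, so
// Release ordering on them does not order them before the flag. A target that polls the flag
// rather than waiting for the IPI could still read the previous range — confirm.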
percpublock.tlb_flush_start.store(0, Ordering::Release);
percpublock.tlb_flush_count.store(0, Ordering::Release);
crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
} else {
// Broadcast TLB shootdown: set flag on all other CPUs, then send a single
// IPI with "all except self" destination shorthand instead of N individual IPIs.
let my_percpublock = PercpuBlock::current();
for id in 0..crate::cpu_count() {
// TODO: Optimize: use global counter and percpu ack counters, send IPI using
// destination shorthand "all CPUs".
shootdown_tlb_ipi(Some(LogicalCpuId::new(id)));
let target_id = LogicalCpuId::new(id);
if target_id == my_percpublock.cpu_id {
continue;
}
let Some(percpublock) = (unsafe {
ALL_PERCPU_BLOCKS[id as usize]
.load(Ordering::Acquire)
.as_ref()
}) else {
continue;
};
// Wait if this CPU still has a pending shootdown from a previous request
#[expect(clippy::bool_comparison)]
while percpublock
.wants_tlb_shootdown
.swap(true, Ordering::Release)
== true
{
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
my_percpublock.maybe_handle_tlb_shootdown();
hint::spin_loop();
}
}
// Full flush — clear range info (Release ordering)
percpublock.tlb_flush_start.store(0, Ordering::Release);
percpublock.tlb_flush_count.store(0, Ordering::Release);
}
// Single broadcast IPI to all other CPUs using destination shorthand
crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
}
}
/// Range-based TLB shootdown IPI.
///
/// Asks the target CPU (or, when `target` is `None`, every other CPU that has
/// a registered percpu block) to invalidate `count` pages starting at `start`,
/// rounded down to a 4 KiB boundary. Requests with `count == 0` or with more
/// than `TLB_RANGE_THRESHOLD` pages are recorded as `(0, 0)`, which the
/// shootdown handler treats as a full flush.
///
/// Does nothing unless the `multi_core` feature is enabled.
///
/// # Panics
/// Panics if `target` names the calling CPU.
pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
    if cfg!(not(feature = "multi_core")) {
        return;
    }

    let start_aligned = start as u64 & !0xFFF;
    // Decide on the full-width `count` *before* narrowing it. Casting first
    // would let a huge count whose low 32 bits are small (e.g. 2^32 + 1) pass
    // the threshold check and flush only a handful of pages.
    let use_range = count > 0 && count <= TLB_RANGE_THRESHOLD as usize;
    let (flush_start, flush_count): (u64, u32) = if use_range {
        (start_aligned, count as u32)
    } else {
        // (0, 0) requests a full flush.
        (0, 0)
    };

    let my_percpublock = PercpuBlock::current();

    // Claim the target's request flag, then record what it should flush.
    // While another request is still pending on the target, keep servicing
    // shootdowns aimed at this CPU so two CPUs targeting each other cannot
    // deadlock.
    //
    // NOTE(review): the range is stored after the flag has been set, so a
    // target that polls the flag in between could act on the previously stored
    // range — confirm whether the protocol tolerates this.
    let claim_and_arm = |percpublock: &PercpuBlock| {
        while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) {
            while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) {
                my_percpublock.maybe_handle_tlb_shootdown();
                hint::spin_loop();
            }
        }
        percpublock.tlb_flush_start.store(flush_start, Ordering::Release);
        percpublock.tlb_flush_count.store(flush_count, Ordering::Release);
    };

    if let Some(target) = target {
        assert_ne!(target, my_percpublock.cpu_id);

        let Some(percpublock) = (unsafe {
            ALL_PERCPU_BLOCKS[target.get() as usize]
                .load(Ordering::Acquire)
                .as_ref()
        }) else {
            // Target CPU has no registered percpu block: nothing to notify.
            return;
        };

        claim_and_arm(percpublock);
        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
    } else {
        for id in 0..crate::cpu_count() {
            if LogicalCpuId::new(id) == my_percpublock.cpu_id {
                continue;
            }

            let Some(percpublock) = (unsafe {
                ALL_PERCPU_BLOCKS[id as usize]
                    .load(Ordering::Acquire)
                    .as_ref()
            }) else {
                continue;
            };

            claim_and_arm(percpublock);
        }

        // One broadcast IPI to every other CPU once all flags are armed.
        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
    }
}
impl PercpuBlock {
/// Effective scheduling priority of this CPU's context once priority
/// inheritance is taken into account.
///
/// Lower number = higher priority (0-39 range). The donated priority wins
/// only when it is strictly smaller than `context_prio`.
pub fn effective_prio(&self, context_prio: usize) -> usize {
    let inherited = self.pi_donated_prio.load(Ordering::Relaxed);
    if inherited >= context_prio as u32 {
        // No donation, or the donation is not better than our own priority.
        return context_prio;
    }
    inherited as usize
}
pub fn maybe_handle_tlb_shootdown(&self) {
#[expect(clippy::bool_comparison)]
if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false {
return;
}
// TODO: Finer-grained flush
crate::memory::RmmA::invalidate_all();
let start = self.tlb_flush_start.load(Ordering::Acquire);
let count = self.tlb_flush_count.load(Ordering::Acquire);
if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD {
// Range-based flush using INVLPG per page — cheaper than full CR3 reload.
for i in 0..count {
let addr = start + (i as u64) * 4096;
crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
}
} else {
// Full TLB flush (CR3 reload) for large ranges or global shootdowns.
crate::memory::RmmA::invalidate_all();
}
if let Some(addrsp) = &*self.current_addrsp.borrow() {
addrsp.tlb_ack.fetch_add(1, Ordering::Release);
@@ -189,6 +358,14 @@ impl PercpuBlock {
wants_tlb_shootdown: AtomicBool::new(false),
balance: Cell::new([0; 40]),
last_queue: Cell::new(39),
mcs_sched_node: McsNode::new(),
mcs_contention_count: Cell::new(0),
tlb_flush_start: AtomicU64::new(0),
tlb_flush_count: AtomicU32::new(0),
pi_donated_prio: AtomicU32::new(u32::MAX),
current_prio: Cell::new(39),
numa_node: Cell::new(u8::MAX),
waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()),
ptrace_flags: Cell::new(PtraceFlags::empty()),
ptrace_session: RefCell::new(None),
inside_syscall: Cell::new(false),
+65 -3
View File
@@ -10,6 +10,7 @@ use syscall::{
use crate::{
acpi::{RxsdtEnum, RXSDT_ENUM},
arch::sleep,
context::file::InternalFlags,
event,
sync::{CleanLockToken, RwLock, WaitCondition, L1},
@@ -40,6 +41,7 @@ enum HandleKind {
TopLevel,
Rxsdt,
ShutdownPipe,
SleepControl,
SchemeRoot,
}
@@ -146,11 +148,11 @@ impl KernelScheme for AcpiScheme {
if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK {
return Err(Error::new(EINVAL));
}
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
let (handle_kind, int_flags) = match path {
"" => {
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(EISDIR));
}
@@ -158,17 +160,36 @@ impl KernelScheme for AcpiScheme {
(HandleKind::TopLevel, InternalFlags::POSITIONED)
}
"rxsdt" => {
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(ENOTDIR));
}
(HandleKind::Rxsdt, InternalFlags::POSITIONED)
}
"kstop" => {
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(ENOTDIR));
}
(HandleKind::ShutdownPipe, InternalFlags::empty())
}
"sleep" => {
if flags & O_ACCMODE == O_RDONLY || flags & O_STAT == O_STAT {
// allowed
} else if flags & O_ACCMODE != syscall::flag::O_WRONLY
&& flags & O_ACCMODE != syscall::flag::O_RDWR
{
return Err(Error::new(EINVAL));
}
if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(ENOTDIR));
}
(HandleKind::SleepControl, InternalFlags::POSITIONED)
}
_ => return Err(Error::new(ENOENT)),
};
@@ -191,6 +212,7 @@ impl KernelScheme for AcpiScheme {
Ok(match handle.kind {
HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64,
HandleKind::ShutdownPipe => 1,
HandleKind::SleepControl => sleep::available_sleep_states().len() as u64,
HandleKind::TopLevel => 0,
HandleKind::SchemeRoot => return Err(Error::new(EBADF))?,
})
@@ -253,6 +275,7 @@ impl KernelScheme for AcpiScheme {
return dst_buf.copy_exactly(&[0x42]).map(|()| 1);
}
HandleKind::SleepControl => sleep::available_sleep_states(),
HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?,
HandleKind::TopLevel => return Err(Error::new(EISDIR)),
HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
@@ -295,11 +318,45 @@ impl KernelScheme for AcpiScheme {
kind: DirentKind::Socket,
name: "kstop",
inode: 0,
next_opaque_id: 2,
})?;
}
if opaque <= 2 {
buf.entry(DirEntry {
kind: DirentKind::Regular,
name: "sleep",
inode: 0,
next_opaque_id: u64::MAX,
})?;
}
Ok(buf.finalize())
}
fn kwrite(
&self,
id: usize,
buf: crate::syscall::usercopy::UserSliceRo,
_flags: u32,
_stored_flags: u32,
token: &mut CleanLockToken,
) -> Result<usize> {
let handle = *HANDLES.read(token.token()).get(id)?;
if handle.stat {
return Err(Error::new(EBADF));
}
match handle.kind {
HandleKind::SleepControl => {
let mut tmp = [0_u8; 16];
let len = buf.copy_common_bytes_to_slice(&mut tmp)?;
let request = core::str::from_utf8(&tmp[..len]).map_err(|_| Error::new(EINVAL))?;
sleep::trigger_sleep_request(request)?;
Ok(len)
}
HandleKind::SchemeRoot => Err(Error::new(EBADF)),
_ => Err(Error::new(EBADF)),
}
}
fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
//TODO: construct useful path?
buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes())
@@ -328,6 +385,11 @@ impl KernelScheme for AcpiScheme {
st_size: 1,
..Default::default()
},
HandleKind::SleepControl => Stat {
st_mode: MODE_FILE,
st_size: sleep::available_sleep_states().len().try_into().unwrap_or(u64::MAX),
..Default::default()
},
HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
})?;
@@ -22,9 +22,10 @@ struct Handle {
static HANDLES: RwLock<L1, HandleMap<Handle>> = RwLock::new(HandleMap::new());
/// Add to the input queue
/// Add to the input queue, translating CR to NL (ICRNL) for serial console compatibility.
pub fn debug_input(data: u8, token: &mut CleanLockToken) {
INPUT.send(data, token);
let translated = if data == b'\r' { b'\n' } else { data };
INPUT.send(translated, token);
}
// Notify readers of input updates
@@ -106,12 +107,16 @@ impl KernelScheme for DebugScheme {
fn fevent(
&self,
id: usize,
_flags: EventFlags,
flags: EventFlags,
token: &mut CleanLockToken,
) -> Result<EventFlags> {
let _handle = *HANDLES.read(token.token()).get(id)?;
Ok(EventFlags::empty())
let mut ready = EventFlags::empty();
if flags.contains(EventFlags::EVENT_READ) {
ready |= EventFlags::EVENT_READ;
}
Ok(ready)
}
fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
+53 -1
View File
@@ -1,4 +1,5 @@
use alloc::sync::Arc;
use core::sync::atomic::Ordering;
use syscall::{EventFlags, O_NONBLOCK};
use crate::{
@@ -25,12 +26,25 @@ impl KernelScheme for EventScheme {
fn kopenat(
&self,
id: usize,
_user_buf: StrOrBytes,
user_buf: StrOrBytes,
_flags: usize,
_fcntl_flags: u32,
_ctx: CallerCtx,
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let path = match &user_buf {
StrOrBytes::Str(s) => s,
StrOrBytes::Bytes(b) => core::str::from_utf8(b).unwrap_or(""),
};
if path.starts_with("eventfd/") {
let rest = &path[8..]; // after "eventfd/"
let mut parts = rest.split('/');
let initval: u64 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
let sem: bool = parts.next().and_then(|s| s.parse().ok()).unwrap_or(false);
let id = next_queue_id();
queues_mut(token.token()).insert(id, Arc::new(EventQueue::new_eventfd(id, initval, sem)));
return Ok(OpenResult::SchemeLocal(id.get(), InternalFlags::empty()));
}
if id != SCHEME_ROOT_ID {
return Err(Error::new(EACCES));
}
@@ -67,6 +81,31 @@ impl KernelScheme for EventScheme {
handle.clone()
};
if let Some((ref counter, semaphore)) = queue.eventfd {
let is_nonblock = flags & O_NONBLOCK as u32 != 0;
if semaphore {
let val = counter.load(Ordering::Acquire);
if val == 0 {
if is_nonblock { return Err(Error::new(EAGAIN)); }
// Blocking wait not implemented for eventfd in kernel
return Err(Error::new(EAGAIN));
}
if counter.compare_exchange(val, val - 1, Ordering::AcqRel, Ordering::Relaxed).is_ok() {
let one: u64 = 1;
buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&one as *const u64 as *const u8, 8) })?;
return Ok(8);
}
return Err(Error::new(EAGAIN));
} else {
let val = counter.swap(0, Ordering::AcqRel);
if val == 0 && is_nonblock {
return Err(Error::new(EAGAIN));
}
buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&val as *const u64 as *const u8, 8) })?;
return Ok(8);
}
}
queue.read(buf, flags & O_NONBLOCK as u32 == 0, token)
}
@@ -85,6 +124,19 @@ impl KernelScheme for EventScheme {
let handle = handles.get(&id).ok_or(Error::new(EBADF))?;
handle.clone()
};
if let Some((ref counter, _semaphore)) = queue.eventfd {
if buf.len() >= 8 {
let mut bytes = [0u8; 8];
buf.copy_to_slice(&mut bytes)?;
let val = u64::from_ne_bytes(bytes);
if val == u64::MAX { return Err(Error::new(EINVAL)); }
counter.fetch_add(val, Ordering::AcqRel);
return Ok(8);
}
return Err(Error::new(EINVAL));
}
let mut events_written = 0;
for chunk in buf.in_exact_chunks(size_of::<Event>()) {
+88 -8
View File
@@ -18,6 +18,9 @@ use syscall::{
use crate::context::file::InternalFlags;
use super::{CallerCtx, HandleMap, OpenResult, SchemeExt, StrOrBytes};
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::arch::device::{ioapic, local_apic::ApicId};
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::arch::interrupt::{available_irqs_iter, irq::acknowledge, is_reserved, set_reserved};
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
@@ -56,8 +59,11 @@ const INO_AVAIL: u64 = 0x8000_0000_0000_0000;
const INO_BSP: u64 = 0x8001_0000_0000_0000;
const INO_PHANDLE: u64 = 0x8003_0000_0000_0000;
/// Add to the input queue
/// Add to the input queue, with iommu validation gate for MSI vectors
pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
if irq >= 16 && !iommu_validate_msi_irq(irq) {
return;
}
COUNTS.lock()[irq as usize] += 1;
let fds: SmallVec<[usize; 8]> = {
HANDLES
@@ -77,16 +83,17 @@ pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
#[allow(dead_code)]
enum Handle {
SchemeRoot,
Irq { ack: AtomicUsize, irq: u8 },
Irq { ack: AtomicUsize, irq: u8, cpu_id: LogicalCpuId },
Avail(LogicalCpuId),
TopLevel,
Phandle(u8, Vec<u8>),
Bsp,
IrqAffinity { irq: u8, mask: AtomicUsize },
}
impl Handle {
fn as_irq_handle(&self) -> Option<(&AtomicUsize, u8)> {
match self {
&Self::Irq { ref ack, irq } => Some((ack, irq)),
&Self::Irq { ref ack, irq, cpu_id: _ } => Some((ack, irq)),
_ => None,
}
}
@@ -140,6 +147,7 @@ impl IrqScheme {
Handle::Irq {
ack: AtomicUsize::new(0),
irq: irq_number,
cpu_id: LogicalCpuId::BSP,
},
InternalFlags::empty(),
)
@@ -158,6 +166,7 @@ impl IrqScheme {
Handle::Irq {
ack: AtomicUsize::new(0),
irq: irq_number,
cpu_id,
},
InternalFlags::empty(),
)
@@ -199,6 +208,7 @@ impl IrqScheme {
Handle::Irq {
ack: AtomicUsize::new(0),
irq: irq_number as u8,
cpu_id: LogicalCpuId::new(0),
},
InternalFlags::empty(),
)
@@ -214,6 +224,14 @@ const fn vector_to_irq(vector: u8) -> u8 {
vector - 32
}
/// Whether `vector` lies in the accepted MSI range: 32 up to and including
/// 0xEE.
const fn msi_vector_is_valid(vector: u8) -> bool {
    matches!(vector, 32..=0xEE)
}
/// Validation gate consulted by `irq_trigger` for IRQ numbers >= 16.
///
/// Placeholder: the argument is ignored and every IRQ is accepted.
fn iommu_validate_msi_irq(_irq: u8) -> bool {
true
}
impl crate::scheme::KernelScheme for IrqScheme {
fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
let id = HANDLES.write(token.token()).insert(Handle::SchemeRoot);
@@ -280,7 +298,21 @@ impl crate::scheme::KernelScheme for IrqScheme {
InternalFlags::POSITIONED,
)
} else if let Some(path_str) = path_str.strip_prefix('/') {
Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
let (irq_str, affinity) = path_str
.trim_end_matches('/')
.rsplit_once('/')
.map(|(a, b)| (a, Some(b)))
.unwrap_or((path_str.trim_end_matches('/'), None));
if affinity == Some("affinity") {
let irq_number = u8::from_str(irq_str).or(Err(Error::new(ENOENT)))?;
if irq_number >= TOTAL_IRQ_COUNT {
return Err(Error::new(ENOENT));
}
(Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
InternalFlags::empty())
} else {
Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
}
} else {
return Err(Error::new(ENOENT));
}
@@ -307,12 +339,20 @@ impl crate::scheme::KernelScheme for IrqScheme {
}
#[cfg(not(dtb))]
panic!("")
} else if let Some(rest) = path_str.strip_suffix("/affinity") {
let irq_number = u8::from_str(rest).or(Err(Error::new(ENOENT)))?;
if irq_number >= TOTAL_IRQ_COUNT {
return Err(Error::new(ENOENT));
}
(Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
InternalFlags::empty())
} else if let Ok(plain_irq_number) = u8::from_str(path_str) {
if plain_irq_number < BASE_IRQ_COUNT {
(
Handle::Irq {
ack: AtomicUsize::new(0),
irq: plain_irq_number,
cpu_id: LogicalCpuId::BSP,
},
InternalFlags::empty(),
)
@@ -368,6 +408,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
}
}
Handle::Avail(cpu_id) => {
let mut listed = 0;
for vector in available_irqs_iter(cpu_id).skip(opaque) {
let irq = vector_to_irq(vector);
if cpu_id == LogicalCpuId::BSP && irq < BASE_IRQ_COUNT {
@@ -381,7 +422,9 @@ impl crate::scheme::KernelScheme for IrqScheme {
name: &intermediate,
next_opaque_id: u64::from(vector) + 1,
})?;
listed += 1;
}
info!("irq getdents Avail: cpu_id={} opaque={} listed={}", cpu_id.get(), opaque, listed);
}
_ => return Err(Error::new(ENOTDIR)),
}
@@ -416,11 +459,14 @@ impl crate::scheme::KernelScheme for IrqScheme {
let handle = handles_guard.get(id)?;
if let &Handle::Irq {
irq: handle_irq, ..
irq: handle_irq,
cpu_id: handle_cpu_id,
..
} = handle
&& handle_irq > BASE_IRQ_COUNT
{
set_reserved(LogicalCpuId::BSP, irq_to_vector(handle_irq), false);
info!("irq close: unreserving vector {} on cpu_id={}", irq_to_vector(handle_irq), handle_cpu_id.get());
set_reserved(handle_cpu_id, irq_to_vector(handle_irq), false);
}
Ok(())
}
@@ -436,9 +482,32 @@ impl crate::scheme::KernelScheme for IrqScheme {
let handle = handles_guard.get(file)?;
match handle {
&Handle::IrqAffinity { irq: _handle_irq, ref mask } => {
if buffer.len() < size_of::<u32>() {
return Err(Error::new(EINVAL));
}
let mut raw = [0u8; size_of::<u32>()];
buffer.copy_to_slice(&mut raw)?;
let cpu_id = u32::from_ne_bytes(raw);
let cpus = CPUS.get().ok_or(Error::new(EIO))?;
if !cpus.contains(&(cpu_id as u8)) {
return Err(Error::new(EINVAL));
}
// Reprogram the IOAPIC redirection entry for x86 targets.
// Non-IOAPIC IRQs (e.g. MSI) will return false -> EIO.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if !unsafe { ioapic::set_affinity(_handle_irq, ApicId::new(cpu_id)) } {
return Err(Error::new(EIO));
}
}
mask.store(cpu_id as usize, Ordering::Release);
Ok(size_of::<u32>())
}
&Handle::Irq {
irq: handle_irq,
ack: ref handle_ack,
cpu_id: _,
} => {
if buffer.len() < size_of::<usize>() {
return Err(Error::new(EINVAL));
@@ -475,6 +544,15 @@ impl crate::scheme::KernelScheme for IrqScheme {
st_nlink: 1,
..Default::default()
},
Handle::IrqAffinity { irq, .. } => Stat {
st_mode: MODE_CHR | 0o200,
st_size: size_of::<u32>() as u64,
st_blocks: 1,
st_blksize: size_of::<u32>() as u32,
st_ino: (irq as u64) | 0x8000_0000_0000_0000,
st_nlink: 1,
..Default::default()
},
Handle::Bsp => Stat {
st_mode: MODE_CHR | 0o400,
st_size: size_of::<usize>() as u64,
@@ -516,8 +594,9 @@ impl crate::scheme::KernelScheme for IrqScheme {
let scheme_path = match handle {
Handle::Irq { irq, .. } => format!("irq:{}", irq),
Handle::IrqAffinity { irq, .. } => format!("irq:{}/affinity", irq),
Handle::Bsp => "irq:bsp".to_owned(),
Handle::Avail(cpu_id) => format!("irq:cpu-{:2x}", cpu_id.get()),
Handle::Avail(cpu_id) => format!("irq:cpu-{:02x}", cpu_id.get()),
Handle::Phandle(phandle, _) => format!("irq:phandle-{}", phandle),
Handle::TopLevel => "irq:".to_owned(),
_ => return Err(Error::new(EBADF)),
@@ -543,6 +622,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
Handle::Irq {
irq: handle_irq,
ack: ref handle_ack,
cpu_id: _,
} => {
if buffer.len() < size_of::<usize>() {
return Err(Error::new(EINVAL));
@@ -562,7 +642,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
buffer.write_u32(LogicalCpuId::BSP.get())?;
Ok(size_of::<usize>())
}
Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot => {
Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot | Handle::IrqAffinity { .. } => {
Err(Error::new(EISDIR))
}
}
+60 -25
View File
@@ -14,7 +14,7 @@ use alloc::{
};
use core::{
str,
sync::atomic::{AtomicUsize, Ordering},
sync::atomic::{AtomicU64, AtomicUsize, Ordering},
};
use hashbrown::hash_map::{self, DefaultHashBuilder, HashMap};
use spin::Once;
@@ -169,6 +169,7 @@ enum Handle {
/// Schemes list
static HANDLES: Once<RwLock<L1, HashMap<SchemeId, Handle>>> = Once::new();
static SCHEME_GENERATIONS: Once<RwLock<L1, HashMap<SchemeId, AtomicU64>>> = Once::new();
static SCHEME_LIST_NEXT_ID: AtomicUsize = AtomicUsize::new(MAX_GLOBAL_SCHEMES);
static SCHEME_LIST_ID: AtomicUsize = AtomicUsize::new(0);
@@ -204,6 +205,10 @@ fn init_schemes() -> RwLock<L1, HashMap<SchemeId, Handle>> {
RwLock::new(handles)
}
/// Build the initial, empty per-scheme generation table.
/// Used as the one-time initializer for `SCHEME_GENERATIONS`.
fn init_scheme_generations() -> RwLock<L1, HashMap<SchemeId, AtomicU64>> {
RwLock::new(HashMap::new())
}
/// Get a handle to a scheme.
pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<KernelSchemes> {
match handles().read(token).get(&scheme_id) {
@@ -212,10 +217,33 @@ pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<Kerne
}
}
/// Read the current generation counter of `scheme_id`.
///
/// A scheme with no recorded generation reports `0`.
pub fn current_scheme_generation(token: LockToken<'_, L0>, scheme_id: SchemeId) -> u64 {
    let generations = scheme_generations().read(token);
    match generations.get(&scheme_id) {
        Some(counter) => counter.load(Ordering::Acquire),
        None => 0,
    }
}
/// Access the global scheme handle table, running `init_schemes` on first use.
fn handles<'a>() -> &'a RwLock<L1, HashMap<SchemeId, Handle>> {
HANDLES.call_once(init_schemes)
}
/// Access the global per-scheme generation table, running
/// `init_scheme_generations` on first use.
fn scheme_generations<'a>() -> &'a RwLock<L1, HashMap<SchemeId, AtomicU64>> {
SCHEME_GENERATIONS.call_once(init_scheme_generations)
}
/// Advance the generation counter of `scheme_id` by one.
///
/// A scheme seen for the first time gets a fresh counter, which therefore
/// reads `1` after this call.
fn increment_scheme_generation(scheme_id: SchemeId, token: &mut CleanLockToken) {
    scheme_generations()
        .write(token.token())
        .entry(scheme_id)
        // Missing entries start at zero so the bump below lands on 1.
        .or_insert_with(|| AtomicU64::new(0))
        .fetch_add(1, Ordering::AcqRel);
}
/// Scheme list type
pub struct SchemeList;
@@ -260,9 +288,14 @@ impl SchemeList {
/// Remove a scheme
fn remove(&self, id: usize, token: &mut CleanLockToken) {
let scheme = handles().write(token.token()).remove(&SchemeId(id));
let scheme_id = SchemeId(id);
let scheme = handles().write(token.token()).remove(&scheme_id);
assert!(scheme.is_some());
if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme.as_ref() {
user.inner.fail_pending_calls(token);
}
increment_scheme_generation(scheme_id, token);
if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme
&& let Some(user) = Arc::into_inner(user.inner)
{
@@ -287,32 +320,32 @@ impl KernelScheme for SchemeList {
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let scheme_id = SchemeId(scheme_id);
match handles()
.read(token.token())
.get(&scheme_id)
.ok_or(Error::new(EBADF))?
{
Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => {
let inner = inner.clone();
assert!(scheme_id == inner.scheme_id);
let scheme = scheme_id;
let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
return Ok(OpenResult::External(Arc::new(RwLock::new(
FileDescription {
scheme,
number: params.number,
offset: params.offset,
flags: params.flags as u32,
internal_flags: InternalFlags::from_extra0(params.internal_flags)
.ok_or(Error::new(EINVAL))?,
},
))));
let maybe_inner = {
let handles = handles().read(token.token());
match handles.get(&scheme_id).ok_or(Error::new(EBADF))? {
Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => Some(inner.clone()),
Handle::SchemeCreationCapability => None,
_ => return Err(Error::new(EBADF)),
}
Handle::SchemeCreationCapability => (),
_ => return Err(Error::new(EBADF)),
};
if let Some(inner) = maybe_inner {
assert!(scheme_id == inner.scheme_id);
let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
return Ok(OpenResult::External(Arc::new(RwLock::new(
FileDescription::new(
scheme_id,
params.number,
params.offset,
params.flags as u32,
InternalFlags::from_extra0(params.internal_flags)
.ok_or(Error::new(EINVAL))?,
token,
),
))));
}
const EXPECTED: &[u8] = b"create-scheme";
let mut buf = [0u8; EXPECTED.len()];
@@ -777,6 +810,7 @@ pub struct CallerCtx {
pub pid: usize,
pub uid: u32,
pub gid: u32,
pub groups: alloc::vec::Vec<u32>,
}
impl CallerCtx {
pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self {
@@ -785,6 +819,7 @@ impl CallerCtx {
pid: self.pid,
uid: euid,
gid: egid,
groups: self.groups,
}
} else {
self
+386 -195
View File
@@ -1,5 +1,10 @@
use alloc::{collections::VecDeque, sync::Arc, vec::Vec};
use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use alloc::{
collections::VecDeque,
string::{String, ToString},
sync::Arc,
vec::Vec,
};
use core::sync::atomic::{AtomicUsize, Ordering};
use syscall::{data::GlobalSchemes, CallFlags};
@@ -14,67 +19,228 @@ use crate::{
sync::{CleanLockToken, Mutex, RwLock, WaitCondition, L1},
syscall::{
data::Stat,
error::{Error, Result, EAGAIN, EBADF, EINTR, EINVAL, ENOENT, EPIPE},
flag::{EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_NONBLOCK},
error::{
Error, Result, EAGAIN, EBADF, EEXIST, EINVAL, EINTR, ENOENT, ENOTDIR, EPIPE,
},
flag::{
EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_ACCMODE, O_DIRECTORY,
O_NONBLOCK, O_RDONLY, O_RDWR, O_STAT, O_WRONLY,
},
usercopy::{UserSliceRo, UserSliceRw, UserSliceWo},
},
};
use super::{CallerCtx, KernelScheme, OpenResult, SchemeExt, StrOrBytes};
// TODO: Preallocate a number of scheme IDs, since there can only be *one* root namespace, and
// therefore only *one* pipe scheme.
static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(0);
static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(1);
#[derive(Clone)]
enum Handle {
Pipe(Arc<Pipe>),
Endpoint(EndpointHandle),
SchemeRoot,
}
// TODO: SLOB?
static PIPES: RwLock<L1, HashMap<usize, Handle>> =
/// Direction(s) in which a pipe endpoint may be used.
#[derive(Clone, Copy, Eq, PartialEq)]
enum EndpointKind {
    /// Read-only endpoint.
    Read,
    /// Write-only endpoint.
    Write,
    /// Endpoint usable for both reading and writing.
    ReadWrite,
}

impl EndpointKind {
    /// True for `Read` and `ReadWrite`.
    fn can_read(self) -> bool {
        match self {
            Self::Read | Self::ReadWrite => true,
            Self::Write => false,
        }
    }

    /// True for `Write` and `ReadWrite`.
    fn can_write(self) -> bool {
        match self {
            Self::Write | Self::ReadWrite => true,
            Self::Read => false,
        }
    }
}
/// One open end of a pipe, as stored in the handle table.
#[derive(Clone)]
struct EndpointHandle {
/// The pipe this endpoint is attached to.
pipe: Arc<Pipe>,
/// Which directions this endpoint may use.
kind: EndpointKind,
/// The named FIFO this endpoint was opened through; `None` for endpoints
/// created by `pipe()`.
named: Option<Arc<NamedPipe>>,
}
/// Registry entry for a named FIFO.
struct NamedPipe {
/// Set from the `display_path` argument of `create_named_pipe`.
path: String,
/// Mode supplied when the FIFO was created.
mode: u16,
/// The pipe currently shared by this FIFO's open endpoints, if any.
active: Mutex<L1, Option<Arc<Pipe>>>,
}
static HANDLES: RwLock<L1, HashMap<usize, Handle>> =
RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
static NAMED_PIPES: RwLock<L1, HashMap<String, Arc<NamedPipe>>> =
RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
const MAX_QUEUE_SIZE: usize = 65536;
// In almost all places where Rust (and LLVM) uses pointers, they are limited to nonnegative isize,
// so this is fine.
const WRITE_NOT_READ_BIT: usize = 1;
/// Hand out the next handle id by post-incrementing `PIPE_NEXT_ID`.
fn next_id() -> usize {
PIPE_NEXT_ID.fetch_add(1, Ordering::Relaxed)
}
fn from_raw_id(id: usize) -> (bool, usize) {
(id & WRITE_NOT_READ_BIT != 0, id & !WRITE_NOT_READ_BIT)
fn endpoint_kind_from_flags(flags: usize) -> Result<EndpointKind> {
match flags & O_ACCMODE {
O_RDONLY => Ok(EndpointKind::Read),
O_WRONLY => Ok(EndpointKind::Write),
O_RDWR => Ok(EndpointKind::ReadWrite),
_ => Err(Error::new(EINVAL)),
}
}
/// Check that `flags` are acceptable for opening a named FIFO.
///
/// Fails with `ENOTDIR` when `O_DIRECTORY` is requested without `O_STAT`,
/// and with the error from `endpoint_kind_from_flags` when the access mode is
/// not recognised.
fn validate_named_fifo_open(flags: usize) -> Result<()> {
    let wants_directory = flags & O_DIRECTORY == O_DIRECTORY;
    let stat_only = flags & O_STAT == O_STAT;
    if wants_directory && !stat_only {
        return Err(Error::new(ENOTDIR));
    }

    // Only the validity of the access mode matters here; the kind is dropped.
    endpoint_kind_from_flags(flags).map(|_| ())
}
/// Fire `flags` as an event on every open endpoint attached to `pipe`.
///
/// `require_read` / `require_write` restrict the set to endpoints that can
/// read / write respectively. Matching ids are gathered while the handle
/// table is read-locked; the events are triggered after the lock is released.
fn trigger_matching(
    pipe: &Arc<Pipe>,
    require_read: bool,
    require_write: bool,
    flags: EventFlags,
    token: &mut CleanLockToken,
) {
    let mut matching_ids = Vec::new();
    {
        let handles = HANDLES.read(token.token());
        for (id, handle) in handles.iter() {
            let Handle::Endpoint(endpoint) = handle else {
                continue;
            };
            let same_pipe = Arc::ptr_eq(&endpoint.pipe, pipe);
            let read_ok = !require_read || endpoint.kind.can_read();
            let write_ok = !require_write || endpoint.kind.can_write();
            if same_pipe && read_ok && write_ok {
                matching_ids.push(*id);
            }
        }
    }

    for id in matching_ids {
        event::trigger(GlobalSchemes::Pipe.scheme_id(), id, flags, token);
    }
}
/// Register a new endpoint on `pipe` and return its handle id.
///
/// Bumps the pipe's reader and/or writer count according to `kind`, then
/// stores the endpoint in the handle table under a freshly allocated id.
fn open_endpoint(
    pipe: Arc<Pipe>,
    kind: EndpointKind,
    named: Option<Arc<NamedPipe>>,
    token: &mut CleanLockToken,
) -> usize {
    let (reads, writes) = (kind.can_read(), kind.can_write());
    if reads {
        pipe.reader_count.fetch_add(1, Ordering::SeqCst);
    }
    if writes {
        pipe.writer_count.fetch_add(1, Ordering::SeqCst);
    }

    let id = next_id();
    let handle = Handle::Endpoint(EndpointHandle { pipe, kind, named });
    HANDLES.write(token.token()).insert(id, handle);
    id
}
/// Tear down the pipe's wait conditions if `pipe` is the last reference.
///
/// When other `Arc` clones are still alive this only releases the caller's
/// reference and does nothing else.
fn drop_wait_conditions_if_possible(pipe: Arc<Pipe>, token: &mut CleanLockToken) {
    let Some(owned) = Arc::into_inner(pipe) else {
        return;
    };
    owned.read_condition.into_drop(token);
    owned.write_condition.into_drop(token);
}
pub fn pipe(token: &mut CleanLockToken) -> Result<(usize, usize)> {
// Bit 0 is used for WRITE_NOT_READ_BIT
let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
let pipe = Arc::new(Pipe::new());
let read_id = open_endpoint(Arc::clone(&pipe), EndpointKind::Read, None, token);
let write_id = open_endpoint(pipe, EndpointKind::Write, None, token);
PIPES.write(token.token()).insert(
id,
Handle::Pipe(Arc::new(Pipe {
queue: Mutex::new(VecDeque::new()),
read_condition: WaitCondition::new(),
write_condition: WaitCondition::new(),
writer_is_alive: AtomicBool::new(true),
reader_is_alive: AtomicBool::new(true),
has_run_dup: AtomicBool::new(false),
fd_queue: Mutex::new(VecDeque::new()),
})),
);
Ok((read_id, write_id))
}
Ok((id, id | WRITE_NOT_READ_BIT))
/// Report whether a named FIFO is currently registered under `path`.
pub fn named_pipe_exists(path: &str, token: &mut CleanLockToken) -> bool {
NAMED_PIPES.read(token.token()).contains_key(path)
}
/// Create a named FIFO under `path` and open its first endpoint.
///
/// * `path` — key under which the FIFO is registered.
/// * `display_path` — path string stored in the FIFO's registry entry.
/// * `mode` — mode stored in the FIFO's registry entry.
/// * `flags` — open flags; the access mode selects the endpoint direction.
///
/// Returns the handle id of the new endpoint.
///
/// Errors: `ENOTDIR` / `EINVAL` for unacceptable `flags` (see
/// `validate_named_fifo_open`), `EEXIST` when `path` is already registered.
pub fn create_named_pipe(
    path: &str,
    display_path: &str,
    mode: u16,
    flags: usize,
    token: &mut CleanLockToken,
) -> Result<usize> {
    validate_named_fifo_open(flags)?;
    // Depends only on `flags`, which were validated above; resolved up front
    // so nothing fallible runs once the name has been published.
    let kind = endpoint_kind_from_flags(flags)?;

    let (named, pipe) = {
        let mut named_pipes = NAMED_PIPES.write(token.token());
        if named_pipes.contains_key(path) {
            return Err(Error::new(EEXIST));
        }

        // Publish the entry with its pipe already attached. Registering the
        // name with `active == None` and filling it in afterwards leaves a
        // window in which a concurrent `open_named_pipe` installs its own
        // pipe, which would then be overwritten — two pipes for one FIFO.
        let pipe = Arc::new(Pipe::new());
        let named = Arc::new(NamedPipe {
            path: display_path.to_string(),
            mode,
            active: Mutex::new(Some(Arc::clone(&pipe))),
        });
        named_pipes.insert(path.to_string(), Arc::clone(&named));
        (named, pipe)
    };

    Ok(open_endpoint(pipe, kind, Some(named), token))
}
/// Open an endpoint on the named FIFO registered under `path`.
///
/// Returns `Ok(None)` when no FIFO is registered under `path`, otherwise the
/// handle id of the new endpoint. The endpoint joins the FIFO's active pipe,
/// creating one first when the FIFO currently has none.
///
/// Errors: `ENOTDIR` / `EINVAL` for unacceptable `flags`.
pub fn open_named_pipe(path: &str, flags: usize, token: &mut CleanLockToken) -> Result<Option<usize>> {
    validate_named_fifo_open(flags)?;

    let Some(named) = NAMED_PIPES.read(token.token()).get(path).cloned() else {
        return Ok(None);
    };

    let kind = endpoint_kind_from_flags(flags)?;

    let pipe = {
        let mut active = named.active.lock(token.token());
        // Reuse the active pipe, or install a fresh one if there is none.
        let shared = Arc::clone(active.get_or_insert_with(|| Arc::new(Pipe::new())));
        shared
    };

    Ok(Some(open_endpoint(pipe, kind, Some(named), token)))
}
/// Remove the named FIFO registered under `path` from the registry.
///
/// Returns `true` when an entry existed and was removed, `false` otherwise.
pub fn unlink_named_pipe(path: &str, token: &mut CleanLockToken) -> bool {
NAMED_PIPES.write(token.token()).remove(path).is_some()
}
pub struct PipeScheme;
impl PipeScheme {
fn get_pipe(key: usize, token: &mut CleanLockToken) -> Result<Arc<Pipe>> {
PIPES
fn get_endpoint(id: usize, token: &mut CleanLockToken) -> Result<EndpointHandle> {
HANDLES
.read(token.token())
.get(&key)
.get(&id)
.and_then(|handle| match handle {
Handle::Pipe(pipe) => Some(Arc::clone(pipe)),
_ => None,
Handle::Endpoint(endpoint) => Some(endpoint.clone()),
Handle::SchemeRoot => None,
})
.ok_or(Error::new(EBADF))
}
@@ -82,32 +248,33 @@ impl PipeScheme {
impl KernelScheme for PipeScheme {
fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
PIPES.write(token.token()).insert(id, Handle::SchemeRoot);
let id = next_id();
HANDLES.write(token.token()).insert(id, Handle::SchemeRoot);
Ok(id)
}
fn fevent(
&self,
id: usize,
flags: EventFlags,
token: &mut CleanLockToken,
) -> Result<EventFlags> {
let (is_writer_not_reader, key) = from_raw_id(id);
let pipe = Self::get_pipe(key, token)?;
let endpoint = Self::get_endpoint(id, token)?;
let mut ready = EventFlags::empty();
if is_writer_not_reader
if endpoint.kind.can_write()
&& flags.contains(EVENT_WRITE)
&& (pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE
|| !pipe.reader_is_alive.load(Ordering::Acquire))
&& (endpoint.pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE
|| endpoint.pipe.reader_count.load(Ordering::Acquire) == 0)
{
ready |= EventFlags::EVENT_WRITE;
}
if !is_writer_not_reader
if endpoint.kind.can_read()
&& flags.contains(EVENT_READ)
&& (!pipe.queue.lock(token.token()).is_empty()
|| !pipe.writer_is_alive.load(Ordering::Acquire))
&& (!endpoint.pipe.queue.lock(token.token()).is_empty()
|| endpoint.pipe.writer_count.load(Ordering::Acquire) == 0)
{
ready |= EventFlags::EVENT_READ;
}
@@ -116,46 +283,48 @@ impl KernelScheme for PipeScheme {
}
fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
let (is_write_not_read, key) = from_raw_id(id);
let handle = HANDLES
.write(token.token())
.remove(&id)
.ok_or(Error::new(EBADF))?;
let pipe = Self::get_pipe(key, token)?;
let scheme_id = GlobalSchemes::Pipe.scheme_id();
let can_remove = if is_write_not_read {
pipe.writer_is_alive.store(false, Ordering::SeqCst);
event::trigger(scheme_id, key, EVENT_READ, token);
pipe.read_condition.notify(token);
!pipe.reader_is_alive.load(Ordering::SeqCst)
} else {
pipe.reader_is_alive.store(false, Ordering::SeqCst);
event::trigger(scheme_id, key | WRITE_NOT_READ_BIT, EVENT_WRITE, token);
pipe.write_condition.notify(token);
!pipe.writer_is_alive.load(Ordering::SeqCst)
let Handle::Endpoint(endpoint) = handle else {
return Ok(());
};
if can_remove {
let handle = PIPES.write(token.token()).remove(&key);
if let Some(Handle::Pipe(pipe)) = handle
&& let Some(pipe) = Arc::into_inner(pipe)
{
{
pipe.read_condition.into_drop(token);
}
{
pipe.write_condition.into_drop(token);
}
}
let mut last_reader = false;
let mut last_writer = false;
if endpoint.kind.can_read() {
last_reader = endpoint.pipe.reader_count.fetch_sub(1, Ordering::SeqCst) == 1;
}
if endpoint.kind.can_write() {
last_writer = endpoint.pipe.writer_count.fetch_sub(1, Ordering::SeqCst) == 1;
}
if let Some(pipe) = Arc::into_inner(pipe) {
{
pipe.read_condition.into_drop(token);
}
{
pipe.write_condition.into_drop(token);
if last_writer {
trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
endpoint.pipe.read_condition.notify(token);
}
if last_reader {
trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
endpoint.pipe.write_condition.notify(token);
}
let no_readers = endpoint.pipe.reader_count.load(Ordering::SeqCst) == 0;
let no_writers = endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0;
if no_readers && no_writers {
if let Some(named) = endpoint.named {
let mut active = named.active.lock(token.token());
if active
.as_ref()
.is_some_and(|active_pipe| Arc::ptr_eq(active_pipe, &endpoint.pipe))
{
*active = None;
}
}
drop_wait_conditions_if_possible(endpoint.pipe, token);
}
Ok(())
@@ -168,9 +337,9 @@ impl KernelScheme for PipeScheme {
_ctx: CallerCtx,
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let (is_writer_not_reader, key) = from_raw_id(old_id);
let endpoint = Self::get_endpoint(old_id, token)?;
if is_writer_not_reader {
if !endpoint.kind.can_read() {
return Err(Error::new(EBADF));
}
@@ -180,17 +349,17 @@ impl KernelScheme for PipeScheme {
return Err(Error::new(EINVAL));
}
let pipe = Self::get_pipe(key, token)?;
if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
return Err(Error::new(EBADF));
}
Ok(OpenResult::SchemeLocal(
key | WRITE_NOT_READ_BIT,
open_endpoint(
Arc::clone(&endpoint.pipe),
EndpointKind::Write,
endpoint.named,
token,
),
InternalFlags::empty(),
))
}
fn kopenat(
&self,
id: usize,
@@ -200,40 +369,47 @@ impl KernelScheme for PipeScheme {
_ctx: CallerCtx,
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let (_, key) = from_raw_id(id);
let is_scheme_root = {
let handles = HANDLES.read(token.token());
match handles.get(&id) {
Some(Handle::SchemeRoot) => true,
Some(Handle::Endpoint(_)) => false,
None => return Err(Error::new(EBADF)),
}
};
{
let guard = PIPES.read(token.token());
if let Some(Handle::SchemeRoot) = guard.get(&key) {
} else if let Some(Handle::Pipe(pipe_arc)) = guard.get(&key) {
let pipe = Arc::clone(pipe_arc);
drop(guard);
if user_buf.as_bytes() == b"write" {
return Err(Error::new(EINVAL));
}
if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
return Err(Error::new(EBADF));
if is_scheme_root {
let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
if !path.trim_start_matches('/').is_empty() {
return Err(Error::new(ENOENT));
}
let pipe = Arc::new(Pipe::new());
return Ok(OpenResult::SchemeLocal(
key | WRITE_NOT_READ_BIT,
open_endpoint(pipe, EndpointKind::Read, None, token),
InternalFlags::empty(),
));
} else {
return Err(Error::new(EBADF));
}
}
let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
if !path.trim_start_matches('/').is_empty() {
return Err(Error::new(ENOENT));
let endpoint = Self::get_endpoint(id, token)?;
if !endpoint.kind.can_read() {
return Err(Error::new(EBADF));
}
let (read_id, _) = pipe(token)?;
let path = user_buf.as_bytes();
if !path.is_empty() && path != b"write" {
return Err(Error::new(EINVAL));
}
Ok(OpenResult::SchemeLocal(read_id, InternalFlags::empty()))
Ok(OpenResult::SchemeLocal(
open_endpoint(
Arc::clone(&endpoint.pipe),
EndpointKind::Write,
endpoint.named,
token,
),
InternalFlags::empty(),
))
}
fn kread(
@@ -244,16 +420,15 @@ impl KernelScheme for PipeScheme {
_stored_flags: u32,
token: &mut CleanLockToken,
) -> Result<usize> {
let (is_write_not_read, key) = from_raw_id(id);
let endpoint = Self::get_endpoint(id, token)?;
if is_write_not_read {
if !endpoint.kind.can_read() {
return Err(Error::new(EBADF));
}
let pipe = Self::get_pipe(key, token)?;
loop {
let vec = pipe.queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
let vec = endpoint.pipe.queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
let (s1, s2) = vec.as_slices();
let s1_count = core::cmp::min(user_buf.len(), s1.len());
@@ -273,28 +448,34 @@ impl KernelScheme for PipeScheme {
let _ = vec.drain(..bytes_read);
if bytes_read > 0 {
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key | WRITE_NOT_READ_BIT,
EVENT_WRITE,
token.token(),
);
pipe.write_condition.notify_locked(token.token());
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
endpoint.pipe.write_condition.notify(token);
return Ok(bytes_read);
} else if user_buf.is_empty() {
}
if user_buf.is_empty() {
return Ok(0);
}
if !pipe.writer_is_alive.load(Ordering::SeqCst) {
if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
return Ok(0);
} else if fcntl_flags & O_NONBLOCK as u32 != 0 {
}
if fcntl_flags & O_NONBLOCK as u32 != 0 {
return Err(Error::new(EAGAIN));
} else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
}
if !endpoint
.pipe
.read_condition
.wait(vec, "PipeRead::read", &mut lock_token)
{
return Err(Error::new(EINTR));
}
}
}
fn kwrite(
&self,
id: usize,
@@ -303,18 +484,17 @@ impl KernelScheme for PipeScheme {
_stored_flags: u32,
token: &mut CleanLockToken,
) -> Result<usize> {
let (is_write_not_read, key) = from_raw_id(id);
let endpoint = Self::get_endpoint(id, token)?;
if !is_write_not_read {
if !endpoint.kind.can_write() {
return Err(Error::new(EBADF));
}
let pipe = Self::get_pipe(key, token)?;
loop {
let vec = pipe.queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
let vec = endpoint.pipe.queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
if !pipe.reader_is_alive.load(Ordering::Relaxed) {
if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
return Err(Error::new(EPIPE));
}
@@ -329,7 +509,6 @@ impl KernelScheme for PipeScheme {
let mut bytes_written = 0;
// TODO: Modify VecDeque so that the unwritten portions can be accessed directly?
for (idx, chunk) in src_buf.in_variable_chunks(TMPBUF_SIZE).enumerate() {
let chunk_byte_count = match chunk.copy_common_bytes_to_slice(&mut tmp_buf) {
Ok(c) => c,
@@ -341,41 +520,52 @@ impl KernelScheme for PipeScheme {
}
if bytes_written > 0 {
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key,
EVENT_READ,
token.token(),
);
pipe.read_condition.notify_locked(token.token());
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
endpoint.pipe.read_condition.notify(token);
return Ok(bytes_written);
} else if user_buf.is_empty() {
}
if user_buf.is_empty() {
return Ok(0);
}
if fcntl_flags & O_NONBLOCK as u32 != 0 {
return Err(Error::new(EAGAIN));
} else if !pipe
}
if !endpoint
.pipe
.write_condition
.wait(vec, "PipeWrite::write", &mut token)
.wait(vec, "PipeWrite::write", &mut lock_token)
{
return Err(Error::new(EINTR));
}
}
}
fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
//TODO: construct useful path?
buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes())
fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<usize> {
let endpoint = Self::get_endpoint(id, token)?;
if let Some(named) = endpoint.named {
buf.copy_common_bytes_from_slice(named.path.as_bytes())
} else {
buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes())
}
}
fn kfstat(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> {
fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> {
let endpoint = Self::get_endpoint(id, token)?;
let mode = endpoint.named.map_or(0o666, |named| named.mode);
buf.copy_exactly(&Stat {
st_mode: MODE_FIFO | 0o666,
st_mode: MODE_FIFO | mode,
..Default::default()
})?;
Ok(())
}
fn kfdwrite(
&self,
id: usize,
@@ -385,23 +575,17 @@ impl KernelScheme for PipeScheme {
_metadata: &[u64],
token: &mut CleanLockToken,
) -> Result<usize> {
let (is_write_not_read, key) = from_raw_id(id);
let endpoint = Self::get_endpoint(id, token)?;
if !is_write_not_read {
if !endpoint.kind.can_write() {
return Err(Error::new(EBADF));
}
let pipe = match Self::get_pipe(key, token) {
Ok(p) => p,
Err(e) => {
return Err(e);
}
};
loop {
let vec = pipe.fd_queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
let vec = endpoint.pipe.fd_queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
if !pipe.reader_is_alive.load(Ordering::Relaxed) {
if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
return Err(Error::new(EPIPE));
}
if descs.is_empty() {
@@ -421,25 +605,24 @@ impl KernelScheme for PipeScheme {
let fds_written = vec.len() - before_len;
if fds_written > 0 {
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key,
EVENT_READ,
token.token(),
);
pipe.read_condition.notify_locked(token.token());
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
endpoint.pipe.read_condition.notify(token);
return Ok(fds_written);
}
if !pipe
if !endpoint
.pipe
.write_condition
.wait(vec, "PipeWrite::write", &mut token)
.wait(vec, "PipeWrite::write", &mut lock_token)
{
return Err(Error::new(EINTR));
}
}
}
fn kfdread(
&self,
id: usize,
@@ -448,25 +631,19 @@ impl KernelScheme for PipeScheme {
_metadata: &[u64],
token: &mut CleanLockToken,
) -> Result<usize> {
let (is_write_not_read, key) = from_raw_id(id);
let endpoint = Self::get_endpoint(id, token)?;
if is_write_not_read {
if !endpoint.kind.can_read() {
return Err(Error::new(EBADF));
}
let pipe = match Self::get_pipe(key, token) {
Ok(p) => p,
Err(e) => {
return Err(e);
}
};
if payload.is_empty() {
return Ok(0);
}
loop {
let vec = pipe.fd_queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
let vec = endpoint.pipe.fd_queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
let fds_available = vec.len();
let max_fds_read = payload.len() / size_of::<usize>();
@@ -479,31 +656,33 @@ impl KernelScheme for PipeScheme {
fds_to_transfer,
payload,
flags.contains(CallFlags::FD_CLOEXEC),
&mut token,
&mut lock_token,
)?;
} else {
bulk_add_fds(
fds_to_transfer,
payload,
flags.contains(CallFlags::FD_CLOEXEC),
&mut token,
&mut lock_token,
)?;
}
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key | WRITE_NOT_READ_BIT,
EVENT_WRITE,
token.token(),
);
pipe.write_condition.notify_locked(token.token());
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
endpoint.pipe.write_condition.notify(token);
return Ok(fds_to_read);
}
if !pipe.writer_is_alive.load(Ordering::SeqCst) {
if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
return Ok(0);
} else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
}
if !endpoint
.pipe
.read_condition
.wait(vec, "PipeRead::read", &mut lock_token)
{
return Err(Error::new(EINTR));
}
}
@@ -511,11 +690,23 @@ impl KernelScheme for PipeScheme {
}
pub struct Pipe {
read_condition: WaitCondition, // signals whether there are available bytes to read
write_condition: WaitCondition, // signals whether there is room for additional bytes
read_condition: WaitCondition,
write_condition: WaitCondition,
queue: Mutex<L1, VecDeque<u8>>,
reader_is_alive: AtomicBool, // starts set, unset when reader closes
writer_is_alive: AtomicBool, // starts set, unset when writer closes
has_run_dup: AtomicBool,
reader_count: AtomicUsize,
writer_count: AtomicUsize,
fd_queue: Mutex<L1, VecDeque<Arc<LockedFileDescription>>>,
}
impl Pipe {
fn new() -> Self {
Self {
read_condition: WaitCondition::new(),
write_condition: WaitCondition::new(),
queue: Mutex::new(VecDeque::new()),
reader_count: AtomicUsize::new(0),
writer_count: AtomicUsize::new(0),
fd_queue: Mutex::new(VecDeque::new()),
}
}
}
+59 -12
View File
@@ -105,6 +105,7 @@ enum ContextHandle {
// Attr handles, to set ens/euid/egid/pid.
Authority,
Attr,
Groups,
Status {
privileged: bool,
@@ -261,6 +262,7 @@ impl ProcScheme {
let handle = match actual_name {
"attrs" => ContextHandle::Attr,
"status" => ContextHandle::Status { privileged: true },
"groups" => ContextHandle::Groups,
_ => return Err(Error::new(ENOENT)),
};
@@ -306,6 +308,11 @@ impl ProcScheme {
let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed))
.ok_or(Error::new(EMFILE))?;
let context = context::spawn(true, Some(id), ret, token)?;
{
let parent_groups =
context::current().read(token.token()).groups.clone();
context.write(token.token()).groups = parent_groups;
}
HANDLES.write(token.token()).insert(
id.get(),
Handle {
@@ -425,6 +432,7 @@ impl KernelScheme for ProcScheme {
}
fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
let mut inner_token = unsafe { CleanLockToken::new() };
let handle = HANDLES
.write(token.token())
.remove(&id)
@@ -452,9 +460,7 @@ impl KernelScheme for ProcScheme {
))]
regs.set_arg1(arg1);
// TODO: Lock ordering violation
let mut token = unsafe { CleanLockToken::new() };
Ok(context.set_addr_space(Some(new), token.downgrade()))
Ok(context.set_addr_space(Some(new), inner_token.downgrade()))
})?;
if let Some(old_ctx) = old_ctx
&& let Some(addrspace) = Arc::into_inner(old_ctx)
@@ -493,6 +499,7 @@ impl KernelScheme for ProcScheme {
consume: bool,
token: &mut CleanLockToken,
) -> Result<usize> {
let mut inner_token = unsafe { CleanLockToken::new() };
let handle = HANDLES
.read(token.token())
.get(&id)
@@ -583,9 +590,7 @@ impl KernelScheme for ProcScheme {
};
// TODO: Allocated or AllocatedShared?
let addrsp = AddrSpace::current()?;
// TODO: Lock ordering violation
let mut token = unsafe { CleanLockToken::new() };
let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere(
let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere(
&addrsp,
NonZeroUsize::new(1).unwrap(),
MapFlags::PROT_READ | MapFlags::PROT_WRITE,
@@ -849,17 +854,17 @@ impl KernelScheme for ProcScheme {
}
}
fn extract_scheme_number(fd: usize, token: &mut CleanLockToken) -> Result<(KernelSchemes, usize)> {
let (scheme_id, number) = {
let desc = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let (context, mut context_token) = current.token_split();
let file_descriptor = context
.get_file(FileHandle::from(fd), &mut token)
.get_file(FileHandle::from(fd), &mut context_token)
.ok_or(Error::new(EBADF))?;
let desc = file_descriptor.description.read(token.token());
(desc.scheme, desc.number)
*file_descriptor.description.read(context_token.token())
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
let scheme = desc.get_scheme(token)?;
let number = desc.number;
Ok((scheme, number))
}
@@ -1271,6 +1276,39 @@ impl ContextHandle {
guard.prio = (info.prio as usize).min(39);
Ok(size_of::<ProcSchemeAttrs>())
}
Self::Groups => {
const NGROUPS_MAX: usize = 65536;
if buf.len() % size_of::<u32>() != 0 {
return Err(Error::new(EINVAL));
}
let count = buf.len() / size_of::<u32>();
if count > NGROUPS_MAX {
return Err(Error::new(EINVAL));
}
let mut groups = Vec::with_capacity(count);
for chunk in buf.in_exact_chunks(size_of::<u32>()).take(count) {
groups.push(chunk.read_u32()?);
}
let proc_id = {
let guard = context.read(token.token());
guard.owner_proc_id
};
{
let mut guard = context.write(token.token());
guard.groups = groups.clone();
}
if let Some(pid) = proc_id {
let mut contexts = context::contexts(token.downgrade());
let (contexts, mut t) = contexts.token_split();
for context_ref in contexts.iter() {
let mut ctx = context_ref.write(t.token());
if ctx.owner_proc_id == Some(pid) {
ctx.groups = groups.clone();
}
}
}
Ok(count * size_of::<u32>())
}
ContextHandle::OpenViaDup => {
let mut args = buf.usizes();
@@ -1475,6 +1513,15 @@ impl ContextHandle {
debug_name,
})
}
Self::Groups => {
let c = &context.read(token.token());
let max = buf.len() / size_of::<u32>();
let count = c.groups.len().min(max);
for (chunk, gid) in buf.in_exact_chunks(size_of::<u32>()).zip(&c.groups).take(count) {
chunk.copy_from_slice(&gid.to_ne_bytes())?;
}
Ok(count * size_of::<u32>())
}
ContextHandle::Sighandler => {
let data = match context.read(token.token()).sig {
Some(ref sig) => SetSighandlerData {
+157 -63
View File
@@ -80,6 +80,7 @@ const ONE: NonZeroUsize = match NonZeroUsize::new(1) {
Some(one) => one,
None => unreachable!(),
};
const MAX_SPURIOUS_WAKEUPS: usize = 100;
enum ParsedCqe {
TriggerFevent {
@@ -209,6 +210,8 @@ impl UserInner {
caller_responsible: &mut PageSpan,
token: &mut CleanLockToken,
) -> Result<Response> {
let mut remaining_spurious_wakeups = MAX_SPURIOUS_WAKEUPS;
{
// Disable preemption to avoid context switches between setting the
// process state and sending the scheme request. The process is made
@@ -261,7 +264,10 @@ impl UserInner {
};
let states = self.states.lock(token.token());
let (mut states, mut token) = states.into_split();
let (mut states, mut state_token) = states.into_split();
let mut timed_out_descriptions = None;
let mut remove_state = false;
let mut timed_out = false;
match states.get_mut(sqe.tag as usize) {
// invalid state
None => return Err(Error::new(EBADFD)),
@@ -274,24 +280,35 @@ impl UserInner {
fds,
} => {
let maybe_eintr =
eintr_if_sigkill(&mut callee_responsible, &mut token.token());
*o = State::Waiting {
canceling: true,
callee_responsible,
context,
fds,
};
eintr_if_sigkill(&mut callee_responsible, &mut state_token.token());
if maybe_eintr.is_ok() {
remaining_spurious_wakeups =
remaining_spurious_wakeups.saturating_sub(1);
}
if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
remove_state = true;
} else {
*o = State::Waiting {
canceling: true,
callee_responsible,
context,
fds,
};
}
maybe_eintr?;
context::current()
.write(token.token())
.block("UserInner::call (woken up after cancelation request)");
// We do not want to drop the lock before blocking
// as if we get preempted in between we might miss a
// wakeup.
drop(states);
if remove_state {
states.remove(sqe.tag as usize);
timed_out = true;
} else {
context::current()
.write(state_token.token())
.block("UserInner::call (woken up after cancelation request)");
}
}
// spurious wakeup
State::Waiting {
@@ -300,60 +317,76 @@ impl UserInner {
context,
mut callee_responsible,
} => {
let maybe_eintr = eintr_if_sigkill(&mut callee_responsible, &mut token);
let current_context = context::current();
let maybe_eintr =
eintr_if_sigkill(&mut callee_responsible, &mut state_token);
*o = State::Waiting {
// Currently we treat all spurious wakeups to have the same behavior
// as signals (i.e., we send a cancellation request). It is not something
// that should happen, but it certainly can happen, for example if a context
// is awoken through its thread handle without setting any sig bits, or if the
// caller clears its own sig bits. If it actually is a signal, then it is the
// intended behavior.
canceling: true,
fds,
context,
callee_responsible,
};
if maybe_eintr.is_ok() {
remaining_spurious_wakeups =
remaining_spurious_wakeups.saturating_sub(1);
}
if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
remove_state = true;
} else {
*o = State::Waiting {
// Currently we treat all spurious wakeups to have the same behavior
// as signals (i.e., we send a cancellation request). It is not something
// that should happen, but it certainly can happen, for example if a context
// is awoken through its thread handle without setting any sig bits, or if the
// caller clears its own sig bits. If it actually is a signal, then it is the
// intended behavior.
canceling: true,
fds,
context,
callee_responsible,
};
}
maybe_eintr?;
// We do not want to preempt between sending the
// cancellation and blocking again where we might
// miss a wakeup.
let mut preempt = PreemptGuardL1::new(&current_context, &mut token);
let token = preempt.token();
if remove_state {
states.remove(sqe.tag as usize);
timed_out = true;
} else {
// We do not want to preempt between sending the
// cancellation and blocking again where we might
// miss a wakeup.
let mut preempt =
PreemptGuardL1::new(&current_context, &mut state_token);
let token = preempt.token();
self.todo.send_locked(
Sqe {
opcode: Opcode::Cancel as u8,
sqe_flags: SqeFlags::ONEWAY,
tag: sqe.tag,
..Default::default()
},
token.token(),
);
event::trigger_locked(
self.root_id,
self.scheme_id.get(),
EVENT_READ,
token.token(),
);
self.todo.send_locked(
Sqe {
opcode: Opcode::Cancel as u8,
sqe_flags: SqeFlags::ONEWAY,
tag: sqe.tag,
..Default::default()
},
token.token(),
);
event::trigger_locked(
self.root_id,
self.scheme_id.get(),
EVENT_READ,
token.token(),
);
// 1. If cancellation was requested and arrived
// before the scheme processed the request, an
// acknowledgement will be sent back after the
// cancellation is processed and we will be woken up
// again. State will be State::Responded then.
//
// 2. If cancellation was requested but the scheme
// already processed the request, we will receive
// the actual response next and woken up again.
// State will be State::Responded then.
context::current()
.write(token.token())
.block("UserInner::call (spurious wakeup)");
drop(states);
// 1. If cancellation was requested and arrived
// before the scheme processed the request, an
// acknowledgement will be sent back after the
// cancellation is processed and we will be woken up
// again. State will be State::Responded then.
//
// 2. If cancellation was requested but the scheme
// already processed the request, we will receive
// the actual response next and woken up again.
// State will be State::Responded then.
context::current()
.write(token.token())
.block("UserInner::call (spurious wakeup)");
}
}
// invalid state
@@ -368,10 +401,70 @@ impl UserInner {
}
},
}
if let Some(descriptions) = timed_out_descriptions {
drop(states);
for desc in descriptions {
let _ = desc.try_close(token);
}
}
if timed_out {
return Err(Error::new(ETIMEDOUT));
}
}
}
}
/// Turns the file descriptions attached to an abandoned call into owned
/// values that the caller can close.
///
/// Only descriptions for which this was the last strong reference are
/// returned; the others are simply released here and stay owned elsewhere.
///
/// `Arc::into_inner` is used instead of `Arc::try_unwrap(..).ok()`: when two
/// owners give up the last two references at the same time, `try_unwrap` can
/// fail on both sides and the description would be dropped without ever being
/// handed to `try_close`. `Arc::into_inner` guarantees exactly one of them
/// receives the inner value.
fn collect_descriptions_to_close(
    fds: Vec<Arc<LockedFileDescription>>,
) -> Vec<FileDescription> {
    fds.into_iter()
        .filter_map(Arc::into_inner)
        .map(RwLock::into_inner)
        .collect()
}
/// Fails every call that is still waiting on this scheme.
///
/// Each `State::Waiting` entry whose caller context can still be upgraded is
/// replaced by a response carrying `ENODEV`, and that caller is unblocked.
/// Entries whose caller context is gone are removed from `states`. States of
/// any other kind are left untouched.
///
/// File descriptions attached to the failed calls are collected while the
/// `states` lock is held and closed only after it has been released.
pub fn fail_pending_calls(&self, token: &mut CleanLockToken) {
// The block scopes the `states` lock; only the collected descriptions leave it.
let descriptions_to_close = {
let mut states_lock = self.states.lock(token.token());
let (states, mut lock_token) = states_lock.token_split();
let mut descriptions_to_close = Vec::new();
let mut states_to_remove = Vec::new();
for (id, state) in states.iter_mut() {
// Take the state out so its fields can be moved; it is put back below
// when it turns out not to be `Waiting`.
match mem::replace(state, State::Placeholder) {
State::Waiting { context, fds, .. } => {
descriptions_to_close.extend(Self::collect_descriptions_to_close(fds));
match context.upgrade() {
Some(context) => {
// Caller still exists: answer with ENODEV and wake it up.
*state = State::Responded(Response::Regular(
Err(Error::new(ENODEV)),
0,
false,
));
context.write(lock_token.token()).unblock();
}
// Caller is gone: removal is deferred because `states` is being iterated.
None => states_to_remove.push(id),
}
}
old_state => *state = old_state,
}
}
for id in states_to_remove {
states.remove(id);
}
descriptions_to_close
};
// Close errors are deliberately ignored (best effort).
for desc in descriptions_to_close {
let _ = desc.try_close(token);
}
}
/// Map a readable structure to the scheme's userspace and return the
/// pointer
#[must_use = "copying back to head/tail buffers can fail"]
@@ -1283,6 +1376,7 @@ impl UserInner {
}
pub fn into_drop(self, token: &mut CleanLockToken) {
self.fail_pending_calls(token);
self.todo.condition.into_drop(token);
}
}
@@ -74,14 +74,16 @@ impl MemoryEntry {
}
struct MemoryMap {
entries: [MemoryEntry; 512],
entries: [MemoryEntry; 1024],
size: usize,
}
impl MemoryMap {
fn register(&mut self, base: usize, size: usize, kind: BootloaderMemoryKind) {
if self.size >= self.entries.len() {
panic!("Early memory map overflow!");
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'!', options(nostack, preserves_flags)); }
panic!("Early memory map overflow at entry {} (max {})", self.size, self.entries.len());
}
let start = if kind == BootloaderMemoryKind::Free {
align_up(base)
@@ -134,7 +136,7 @@ static MEMORY_MAP: SyncUnsafeCell<MemoryMap> = SyncUnsafeCell::new(MemoryMap {
start: 0,
end: 0,
kind: BootloaderMemoryKind::Null,
}; 512],
}; 1024],
size: 0,
});
@@ -323,7 +325,16 @@ unsafe fn map_memory<A: Arch>(areas: &[MemoryArea], mut bump_allocator: &mut Bum
}
}
let kernel_area = (*MEMORY_MAP.get()).kernel().unwrap();
let kernel_area = match (*MEMORY_MAP.get()).kernel() {
Some(area) => area,
None => {
println!("FATAL: kernel memory area not found in boot memory map");
println!("Cannot determine kernel base address. Halting.");
loop {
core::hint::spin_loop();
}
}
};
let kernel_base = kernel_area.start;
let kernel_size = kernel_area.end.saturating_sub(kernel_area.start);
// Map kernel at KERNEL_OFFSET
+10 -3
View File
@@ -149,6 +149,15 @@ static BOOTSTRAP: spin::Once<Bootstrap> = spin::Once::new();
pub(crate) static AP_READY: AtomicBool = AtomicBool::new(false);
static BSP_READY: AtomicBool = AtomicBool::new(false);
/// Reports an unrecoverable boot failure and never returns.
///
/// Prints `message` exactly as given (no newline is appended, so callers
/// supply their own), then a fixed "cannot continue" line, and finally spins
/// forever on `hint::spin_loop()` instead of panicking.
#[cold]
fn halt_boot(message: &str) -> ! {
print!("{message}");
println!("Kernel boot cannot continue. Halting.");
loop {
hint::spin_loop();
}
}
/// This is the kernel entry point for the primary CPU. The arch crate is responsible for calling this
pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
let mut token = unsafe { CleanLockToken::new() };
@@ -180,9 +189,7 @@ pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
context.euid = 0;
context.egid = 0;
}
Err(err) => {
panic!("failed to spawn userspace_init: {:?}", err);
}
Err(_err) => halt_boot("FATAL: failed to spawn first userspace process userspace_init\n"),
}
run_userspace(&mut token)
+188
View File
@@ -0,0 +1,188 @@
//! MCS (Mellor-Crummey Scott) fair spinlock.
//!
//! Each waiter spins on its own local `locked` flag instead of a shared lock
//! word, eliminating cache-line bouncing under contention. FIFO ordering
//! guarantees fairness. O(1) cache-line transfers on unlock.
//!
//! Supports transitive priority inheritance: when CPU A waits on a lock held
//! by CPU B, and CPU B waits on a lock held by CPU C, A's priority is
//! propagated through the chain to C (up to MAX_PI_CHAIN_DEPTH hops).
use core::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering};
use core::{hint, ptr};
use crate::percpu::PercpuBlock;
/// Maximum depth for transitive priority inheritance chain following.
/// Prevents infinite loops from theoretical lock cycles and bounds latency.
/// Linux uses 20; 8 is conservative for a microkernel with fewer nesting levels.
const MAX_PI_CHAIN_DEPTH: u32 = 8;
/// Queue entry used by one CPU while it holds or waits for an [`McsRawLock`].
pub struct McsNode {
    /// Successor in the queue; null while nobody is enqueued behind this node.
    pub next: AtomicPtr<McsNode>,
    /// Set when the node is enqueued; a waiter spins on it until the
    /// releasing predecessor clears it.
    pub locked: AtomicBool,
}

impl McsNode {
    /// Creates a detached node: no successor, `locked` cleared.
    pub const fn new() -> Self {
        let next = AtomicPtr::new(ptr::null_mut());
        let locked = AtomicBool::new(false);
        McsNode { next, locked }
    }
}
/// Raw MCS spinlock primitive.
///
/// Holds only the queue tail and the holder's CPU id; the queue nodes
/// themselves are supplied by the callers of `acquire` / `try_acquire`.
pub struct McsRawLock {
// Last node in the waiter queue; null when the lock is free.
tail: AtomicPtr<McsNode>,
/// CPU ID of the current lock holder (for priority inheritance).
/// `u32::MAX` means no holder.
holder_cpu: AtomicU32,
}
impl McsRawLock {
    /// Creates an unlocked lock: empty queue and no recorded holder.
    pub const fn new() -> Self {
        Self {
            tail: AtomicPtr::new(ptr::null_mut()),
            holder_cpu: AtomicU32::new(u32::MAX),
        }
    }

    /// Acquires the lock, queueing `node` and spinning on `node.locked` until
    /// the predecessor hands the lock over.
    ///
    /// Returns `false` when the lock was free (uncontended) and `true` when
    /// this CPU had to wait in the queue.
    ///
    /// `node` is linked into the queue by address, so it must stay alive and
    /// must not be passed to another `acquire`/`try_acquire` until the
    /// matching `release(node)` has returned.
    #[inline]
    pub fn acquire(&self, node: &McsNode) -> bool {
        let node_ptr = (node as *const McsNode).cast_mut();

        node.next.store(ptr::null_mut(), Ordering::Relaxed);
        node.locked.store(true, Ordering::Relaxed);

        let prev = self.tail.swap(node_ptr, Ordering::AcqRel);
        if prev.is_null() {
            // Uncontended — record ourselves as holder
            let cpu_id = PercpuBlock::current().cpu_id.get();
            self.holder_cpu.store(cpu_id, Ordering::Release);
            return false;
        }

        // SAFETY: `prev` was the queue tail, so its owner is still enqueued.
        // `release` does not return for that node until it has either reset
        // the tail (impossible now, we replaced it) or observed a non-null
        // `next`, which is exactly what is stored here.
        unsafe {
            (*prev).next.store(node_ptr, Ordering::Release);
        }

        let percpu = PercpuBlock::current();

        // Record which lock we're spinning on (for transitive PI chain following)
        percpu.waiting_on_lock.store(
            (self as *const McsRawLock).cast_mut(),
            Ordering::Release,
        );

        let mut donated = false;
        while node.locked.load(Ordering::Acquire) {
            percpu.maybe_handle_tlb_shootdown();
            // Donate priority to the lock holder (transitively) once per
            // acquisition. The attempt is repeated while the holder has not
            // yet published its CPU id, so the donation is not silently lost
            // in the window between taking the lock and storing `holder_cpu`.
            if !donated {
                donated = self.maybe_donate_priority(percpu);
            }
            hint::spin_loop();
        }

        // Clear waiting_on_lock before proceeding — we now hold the lock
        percpu.waiting_on_lock.store(ptr::null_mut(), Ordering::Release);
        self.holder_cpu.store(percpu.cpu_id.get(), Ordering::Release);
        true
    }

    /// Releases the lock held through `node` and hands it to the next queued
    /// waiter, if any.
    ///
    /// Also resets this CPU's `pi_donated_prio` and the lock's `holder_cpu`
    /// to `u32::MAX`.
    #[inline]
    pub fn release(&self, node: &McsNode) {
        // Clear priority inheritance donation — we no longer hold the lock
        PercpuBlock::current().pi_donated_prio.store(u32::MAX, Ordering::Release);
        // Clear holder CPU
        self.holder_cpu.store(u32::MAX, Ordering::Release);

        let mut next = node.next.load(Ordering::Acquire);
        if next.is_null() {
            // Nobody visible behind us: try to mark the lock free.
            if self
                .tail
                .compare_exchange(
                    node_ptr_of(node),
                    ptr::null_mut(),
                    Ordering::AcqRel,
                    Ordering::Acquire,
                )
                .is_ok()
            {
                return;
            }
            // A waiter swapped itself into `tail` but has not linked itself
            // into `node.next` yet; wait for the link to appear.
            loop {
                next = node.next.load(Ordering::Acquire);
                if !next.is_null() {
                    break;
                }
                hint::spin_loop();
            }
        }

        // SAFETY: `next` is non-null and points at the node of a waiter that
        // is still spinning in `acquire`, so the node is alive.
        unsafe {
            (*next).locked.store(false, Ordering::Release);
        }
    }

    /// Tries to take the lock without waiting.
    ///
    /// Returns `true` and records this CPU as holder when the queue was
    /// empty, `false` otherwise. `node`'s fields are reset before the attempt
    /// even when it fails.
    #[inline]
    pub fn try_acquire(&self, node: &McsNode) -> bool {
        node.next.store(ptr::null_mut(), Ordering::Relaxed);
        node.locked.store(true, Ordering::Relaxed);

        let acquired = self
            .tail
            .compare_exchange(
                ptr::null_mut(),
                node_ptr_of(node),
                Ordering::AcqRel,
                Ordering::Acquire,
            )
            .is_ok();
        if acquired {
            let cpu_id = PercpuBlock::current().cpu_id.get();
            self.holder_cpu.store(cpu_id, Ordering::Release);
        }
        acquired
    }

    /// Donate current CPU's context priority to the lock holder's CPU,
    /// following the PI chain transitively (A→B→C).
    ///
    /// Reads priority from PercpuBlock::current_prio (cached by the scheduler)
    /// to avoid acquiring any lock in the MCS spin loop.
    ///
    /// Chain following: if the holder is itself waiting on another lock,
    /// we propagate our priority to that lock's holder too, up to
    /// MAX_PI_CHAIN_DEPTH hops.
    ///
    /// Returns `false` only when this lock's holder has not published its CPU
    /// id yet (nothing could be donated, the caller should try again), and
    /// `true` once the direct holder has been considered.
    fn maybe_donate_priority(&self, my_percpu: &PercpuBlock) -> bool {
        let my_prio = my_percpu.current_prio.get() as u32;
        let mut current_holder_cpu = self.holder_cpu.load(Ordering::Relaxed);

        if current_holder_cpu == u32::MAX {
            // The lock is mid-handoff: the new holder owns it but has not
            // stored its CPU id yet.
            return false;
        }

        for _ in 0..MAX_PI_CHAIN_DEPTH {
            if current_holder_cpu == u32::MAX {
                return true;
            }

            let holder_percpu = crate::percpu::get_for_cpu(
                crate::cpu_set::LogicalCpuId::new(current_holder_cpu),
            );
            let Some(holder) = holder_percpu else {
                return true;
            };

            // Donate if our priority is higher (lower number) than the
            // current donation. `fetch_min` does this atomically, so two
            // waiters donating at once cannot overwrite the better (lower)
            // value with a worse one.
            holder.pi_donated_prio.fetch_min(my_prio, Ordering::Release);

            // Follow the chain: is this holder also waiting on another lock?
            let next_lock_ptr = holder.waiting_on_lock.load(Ordering::Relaxed);
            if next_lock_ptr.is_null() {
                return true;
            }

            // SAFETY: The pointed-to McsRawLock is a long-lived struct field
            // (e.g., part of the run queue). The holder is currently spinning
            // in acquire(), so the pointer is valid. We only read holder_cpu
            // (an atomic u32) — no mutable access needed.
            let next_holder_cpu =
                unsafe { (*next_lock_ptr).holder_cpu.load(Ordering::Relaxed) };

            // Cycle detection: if the next holder is the same CPU we just visited, stop
            if next_holder_cpu == current_holder_cpu {
                return true;
            }

            current_holder_cpu = next_holder_cpu;
        }
        // Chain depth exhausted — stop to bound latency
        true
    }
}

/// Address of `node` in the form stored in the lock's atomic queue pointers.
#[inline]
fn node_ptr_of(node: &McsNode) -> *mut McsNode {
    (node as *const McsNode).cast_mut()
}
@@ -1,5 +1,6 @@
pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue};
pub mod mcs;
pub mod ordered;
pub mod wait_condition;
pub mod wait_queue;
@@ -52,7 +52,9 @@
//! *g1 = 12;
//! ```
use alloc::sync::Arc;
use core::cell::UnsafeCell;
use core::marker::PhantomData;
use core::ptr;
use crate::percpu::PercpuBlock;
@@ -732,3 +734,143 @@ impl<L: Level, T> Drop for ArcRwLockWriteGuard<L, T> {
/// This function can only be called if no lock is held by the calling thread/task
#[inline]
pub fn check_no_locks(_: LockToken<'_, L0>) {}
// ---------------------------------------------------------------------------
// MCS-based fair mutex (McsMutex)
// ---------------------------------------------------------------------------
/// A mutual exclusion lock using the MCS fair spinlock algorithm.
///
/// Unlike `Mutex<L, T>` which uses a simple spinlock (no fairness under
/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning:
///
/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing.
/// - FIFO ordering prevents starvation.
/// - O(1) cache-line transfers on unlock.
///
/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so
/// this type is suitable for scheduler-internal locks where the holder is always
/// the current CPU.
///
/// The type parameter `L` is the lock-ordering level; it is only carried in
/// `PhantomData` and is used by `lock`/`try_lock` to constrain which
/// `LockToken` a caller may pass in.
pub struct McsMutex<L: Level, T> {
// Raw MCS queue lock that provides the actual mutual exclusion.
raw: crate::sync::mcs::McsRawLock,
// The protected value; only reached through a guard while `raw` is held.
data: UnsafeCell<T>,
// Zero-sized marker tying the mutex to its ordering level `L`.
_phantom: PhantomData<L>,
}
// SAFETY (review note): `data` is an `UnsafeCell<T>`, which is what makes the
// type `!Sync` by default. The guards in this file are the only code that
// dereferences it, and they are only created after `raw` has been acquired,
// so sharing `&McsMutex` between CPUs hands `T` to one holder at a time —
// hence the `T: Send` bound, mirroring what standard mutex types require.
// NOTE(review): this also assumes `McsRawLock` itself is safe to share and
// move between CPUs — confirm in `sync::mcs`.
unsafe impl<L: Level, T: Send> Sync for McsMutex<L, T> {}
unsafe impl<L: Level, T: Send> Send for McsMutex<L, T> {}
impl<L: Level, T> McsMutex<L, T> {
pub const fn new(val: T) -> Self {
Self {
raw: crate::sync::mcs::McsRawLock::new(),
data: UnsafeCell::new(val),
_phantom: PhantomData,
}
}
}
impl<L: Level, T> McsMutex<L, T> {
    /// Acquire the mutex and return a guard giving access to the data.
    ///
    /// The current CPU's `mcs_sched_node` is used as the MCS queue node.
    /// When `McsRawLock::acquire` reports that the acquisition was contended,
    /// the per-CPU `mcs_contention_count` statistic is bumped.
    ///
    /// `lock_token` must be at a level lower than `L`; the returned guard
    /// carries the token downgraded to `L`.
    pub fn lock<'a, LP: Lower<L> + 'a>(
        &'a self,
        lock_token: LockToken<'a, LP>,
    ) -> McsMutexGuard<'a, L, T> {
        let percpu = PercpuBlock::current();
        let contended = self.raw.acquire(&percpu.mcs_sched_node);
        if contended {
            // Pure statistics: wrap instead of risking an arithmetic-overflow
            // panic (overflow-checked builds) while the lock is held.
            let counter = &percpu.mcs_contention_count;
            counter.set(counter.get().wrapping_add(1));
        }
        McsMutexGuard {
            lock: self,
            lock_token: LockToken::downgraded(lock_token),
        }
    }

    /// Try to acquire the mutex without waiting.
    ///
    /// Returns `Some(guard)` when `McsRawLock::try_acquire` succeeds and
    /// `None` otherwise. The contention counter is not touched on this path.
    pub fn try_lock<'a, LP: Lower<L> + 'a>(
        &'a self,
        lock_token: LockToken<'a, LP>,
    ) -> Option<McsMutexGuard<'a, L, T>> {
        let percpu = PercpuBlock::current();
        if self.raw.try_acquire(&percpu.mcs_sched_node) {
            Some(McsMutexGuard {
                lock: self,
                lock_token: LockToken::downgraded(lock_token),
            })
        } else {
            None
        }
    }
}
/// Guard returned by `McsMutex::lock` / `McsMutex::try_lock`.
///
/// Dereferences to the protected `T` and releases the underlying raw lock
/// when dropped. It also carries the `LockToken` at level `L`, which
/// `token_split` lends out so that further locks can be taken while this one
/// is held.
pub struct McsMutexGuard<'a, L: Level, T: 'a> {
// The mutex this guard keeps locked.
lock: &'a McsMutex<L, T>,
// Ordering token at this mutex's level, downgraded from the caller's token.
lock_token: LockToken<'a, L>,
}
impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> {
pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) {
unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) }
}
pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) {
let lock_ref = self.lock;
let token = unsafe { core::ptr::read(&self.lock_token) };
core::mem::forget(self);
(McsRawGuard { lock: lock_ref }, token)
}
pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self {
let lock_ref = raw.lock;
core::mem::forget(raw);
Self {
lock: lock_ref,
lock_token: token,
}
}
}
impl<L: Level, T> core::ops::Deref for McsMutexGuard<'_, L, T> {
    type Target = T;

    /// Shared access to the protected value.
    fn deref(&self) -> &Self::Target {
        let data = self.lock.data.get();
        // SAFETY: the guard's existence means the raw lock is held.
        unsafe { &*data }
    }
}

impl<L: Level, T> core::ops::DerefMut for McsMutexGuard<'_, L, T> {
    /// Exclusive access to the protected value.
    fn deref_mut(&mut self) -> &mut Self::Target {
        let data = self.lock.data.get();
        // SAFETY: the raw lock is held and `&mut self` is unique.
        unsafe { &mut *data }
    }
}

impl<L: Level, T> Drop for McsMutexGuard<'_, L, T> {
    /// Release the raw lock using the *current* CPU's MCS node.
    fn drop(&mut self) {
        let node = &PercpuBlock::current().mcs_sched_node;
        self.lock.raw.release(node);
    }
}
/// Token-less half of a `McsMutexGuard`, produced by
/// `McsMutexGuard::into_split`.
///
/// It still keeps the mutex locked (and releases it on drop) and still
/// dereferences to the protected `T`, but it no longer owns a `LockToken`.
pub struct McsRawGuard<'a, L: Level, T: 'a> {
// The mutex this guard keeps locked.
lock: &'a McsMutex<L, T>,
}
impl<L: Level, T> core::ops::Deref for McsRawGuard<'_, L, T> {
    type Target = T;

    /// Shared access to the protected value.
    fn deref(&self) -> &Self::Target {
        let data = self.lock.data.get();
        // SAFETY: the guard's existence means the raw lock is held.
        unsafe { &*data }
    }
}

impl<L: Level, T> core::ops::DerefMut for McsRawGuard<'_, L, T> {
    /// Exclusive access to the protected value.
    fn deref_mut(&mut self) -> &mut Self::Target {
        let data = self.lock.data.get();
        // SAFETY: the raw lock is held and `&mut self` is unique.
        unsafe { &mut *data }
    }
}

impl<L: Level, T> Drop for McsRawGuard<'_, L, T> {
    /// Release the raw lock using the *current* CPU's MCS node.
    fn drop(&mut self) {
        let node = &PercpuBlock::current().mcs_sched_node;
        self.lock.raw.release(node);
    }
}
+180 -102
View File
@@ -2,7 +2,7 @@
use core::num::NonZeroUsize;
use alloc::{string::String, sync::Arc, vec::Vec};
use alloc::{format, string::{String, ToString}, sync::Arc, vec::Vec};
use redox_path::RedoxPath;
use crate::{
@@ -12,9 +12,9 @@ use crate::{
memory::{AddrSpace, GenericFlusher, Grant, PageSpan, TlbShootdownActions},
},
memory::{Page, VirtualAddress, PAGE_SIZE},
scheme::{self, FileHandle, KernelScheme, OpenResult, StrOrBytes},
scheme::{self, pipe, FileHandle, KernelScheme, OpenResult, SchemeExt, StrOrBytes},
sync::{CleanLockToken, RwLock},
syscall::{data::Stat, error::*, flag::*},
syscall::{data::{GlobalSchemes, Stat}, error::*, flag::*},
};
use super::usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo};
@@ -45,7 +45,7 @@ pub fn file_op_generic_ext<T>(
(file, desc)
};
let scheme = scheme::get_scheme(token.token(), desc.scheme)?;
let scheme = desc.get_scheme(token)?;
op(&*scheme, file.description, desc, token)
}
@@ -62,55 +62,32 @@ pub fn copy_path_to_buf(raw_path: UserSliceRo, max_len: usize) -> Result<String>
// TODO: Define elsewhere
const PATH_MAX: usize = PAGE_SIZE;
pub fn openat(
fh: FileHandle,
raw_path: UserSliceRo,
/// Build the key under which a named pipe is looked up.
///
/// - A path starting with `/` is used verbatim as the key.
/// - Any other path is qualified with the scheme id and the handle number it
///   is relative to: `@fifo:<scheme>:<number>:<path>`.
///
/// NOTE(review): absolute paths are *not* qualified by scheme, so the same
/// absolute path reached through two different schemes maps to one key —
/// confirm this is intended.
fn fifo_path_key(scheme_id: scheme::SchemeId, number: usize, path: &str) -> String {
    if !path.starts_with('/') {
        return format!("@fifo:{}:{}:{}", scheme_id.get(), number, path);
    }
    String::from(path)
}
fn install_open_result(
scheme_id: scheme::SchemeId,
flags: usize,
fcntl_flags: u32,
euid: u32,
egid: u32,
open_result: OpenResult,
token: &mut CleanLockToken,
) -> Result<FileHandle> {
let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
let (scheme_id, number) = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
let desc = pipe.description.read(token.token());
(desc.scheme, desc.number)
};
let caller_ctx = context::current()
.read(token.token())
.caller_ctx()
.filter_uid_gid(euid, egid);
let new_description = {
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
let res = scheme.kopenat(
number,
StrOrBytes::from_str(&path_buf),
flags,
fcntl_flags,
caller_ctx,
token,
);
match res? {
OpenResult::SchemeLocal(number, internal_flags) => {
Arc::new(RwLock::new(FileDescription {
offset: 0,
internal_flags,
scheme: scheme_id,
number,
flags: (flags & !O_CLOEXEC) as u32,
}))
}
OpenResult::External(desc) => desc,
}
let new_description = match open_result {
OpenResult::SchemeLocal(number, internal_flags) => Arc::new(RwLock::new(
FileDescription::new(
scheme_id,
number,
0,
(flags & !O_CLOEXEC) as u32,
internal_flags,
token,
),
)),
OpenResult::External(desc) => desc,
};
let current_lock = context::current();
@@ -126,6 +103,102 @@ pub fn openat(
)
.ok_or(Error::new(EMFILE))
}
/// Probe whether `path` (relative to handle `number`) exists in `scheme`.
///
/// The probe is a `kopenat` with `O_STAT` and no fcntl flags:
/// - `ENOENT` is reported as `Ok(false)`;
/// - any other error is propagated unchanged;
/// - a successful open is reported as `Ok(true)`. A scheme-local handle
///   opened by the probe is closed again on a best-effort basis (the result
///   of `close` is deliberately ignored).
fn path_exists_in_scheme(
    scheme: &dyn KernelScheme,
    number: usize,
    path: &str,
    caller_ctx: scheme::CallerCtx,
    token: &mut CleanLockToken,
) -> Result<bool> {
    let probe = scheme.kopenat(number, StrOrBytes::from_str(path), O_STAT, 0, caller_ctx, token);

    let opened = match probe {
        Ok(opened) => opened,
        Err(err) if err.errno == ENOENT => return Ok(false),
        Err(err) => return Err(err),
    };

    // Only scheme-local handles need an explicit close; an external
    // description is simply dropped.
    if let OpenResult::SchemeLocal(probe_number, _) = opened {
        let _ = scheme.close(probe_number, token);
    }
    Ok(true)
}
pub fn openat(
fh: FileHandle,
raw_path: UserSliceRo,
flags: usize,
fcntl_flags: u32,
euid: u32,
egid: u32,
token: &mut CleanLockToken,
) -> Result<FileHandle> {
let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
let desc = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let pipe = context
.get_file(fh, &mut context_token)
.ok_or(Error::new(EBADF))?;
*pipe.description.read(context_token.token())
};
let scheme = desc.get_scheme(token)?;
let number = desc.number;
let scheme_id = desc.scheme;
let caller_ctx = context::current()
.read(token.token())
.caller_ctx()
.filter_uid_gid(euid, egid);
let fifo_mode_requested = flags & MODE_FIFO as usize == MODE_FIFO as usize;
let fifo_key = fifo_path_key(scheme_id, number, &path_buf);
if pipe::named_pipe_exists(&fifo_key, token) {
if flags & O_EXCL == O_EXCL && flags & O_CREAT == O_CREAT {
return Err(Error::new(EEXIST));
}
if fifo_mode_requested && flags & O_CREAT == O_CREAT {
return Err(Error::new(EEXIST));
}
let pipe_number = pipe::open_named_pipe(&fifo_key, flags, token)?
.ok_or(Error::new(ENOENT))?;
return install_open_result(
GlobalSchemes::Pipe.scheme_id(),
flags,
OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()),
token,
);
}
if fifo_mode_requested && flags & O_CREAT == O_CREAT {
if path_exists_in_scheme(&*scheme, number, &path_buf, caller_ctx, token)? {
return Err(Error::new(EEXIST));
}
let mode = u16::try_from(flags & 0o7777).map_err(|_| Error::new(EINVAL))?;
let pipe_number = pipe::create_named_pipe(&fifo_key, &path_buf, mode, flags, token)?;
return install_open_result(
GlobalSchemes::Pipe.scheme_id(),
flags,
OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()),
token,
);
}
let open_result = scheme.kopenat(
number,
StrOrBytes::from_str(&path_buf),
flags,
fcntl_flags,
caller_ctx,
token,
)?;
install_open_result(scheme_id, flags, open_result, token)
}
/// Unlinkat syscall
pub fn unlinkat(
fh: FileHandle,
@@ -137,22 +210,27 @@ pub fn unlinkat(
) -> Result<()> {
let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
let (number, scheme_id) = {
let desc = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
let desc = pipe.description.read(token.token());
(desc.number, desc.scheme)
let (context, mut context_token) = current.token_split();
let pipe = context
.get_file(fh, &mut context_token)
.ok_or(Error::new(EBADF))?;
*pipe.description.read(context_token.token())
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
let number = desc.number;
let scheme = desc.get_scheme(token)?;
let caller_ctx = context::current()
.read(token.token())
.caller_ctx()
.filter_uid_gid(euid, egid);
if pipe::unlink_named_pipe(&fifo_path_key(desc.scheme, number, &path_buf), token) {
return Ok(());
}
/*
let mut path_buf = BorrowedHtBuf::head()?;
let path = path_buf.use_for_string(raw_path)?;
@@ -199,17 +277,18 @@ fn duplicate_file(
let description = { *file.description.read(token.token()) };
let new_description = {
let scheme = scheme::get_scheme(token.token(), description.scheme)?;
let scheme = description.get_scheme(token)?;
match scheme.kdup(description.number, user_buf, caller_ctx, token)? {
OpenResult::SchemeLocal(number, internal_flags) => {
Arc::new(RwLock::new(FileDescription {
offset: 0,
internal_flags,
scheme: description.scheme,
Arc::new(RwLock::new(FileDescription::new(
description.scheme,
number,
flags: description.flags,
}))
0,
description.flags,
internal_flags,
token,
)))
}
OpenResult::External(desc) => desc,
}
@@ -296,11 +375,10 @@ fn call_normal(
}
.ok_or(Error::new(EBADF))?;
let (scheme_id, number) = {
let desc = file.description.read(token.token());
(desc.scheme, desc.number)
let (scheme, number) = {
let desc = *file.description.read(token.token());
(desc.get_scheme(token)?, desc.number)
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
if flags.contains(CallFlags::STD_FS) {
scheme.translate_std_fs_call(number, file.description, payload, flags, metadata, token)
@@ -341,28 +419,28 @@ fn fdwrite_inner(
) -> Result<usize> {
// TODO: Ensure deadlocks can't happen
let (scheme, number, descs_to_send) = {
let (scheme, number) = {
let desc = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let (context, mut context_token) = current.token_split();
let file_descriptor = context
.get_file(socket, &mut token)
.get_file(socket, &mut context_token)
.ok_or(Error::new(EBADF))?;
let desc = &file_descriptor.description.read(token.token());
(desc.scheme, desc.number)
*file_descriptor.description.read(context_token.token())
};
let scheme = scheme::get_scheme(token.token(), scheme)?;
let scheme = desc.get_scheme(token)?;
let number = desc.number;
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let (context, mut context_token) = current.token_split();
(
scheme,
number,
if flags.contains(CallFlags::FD_CLONE) {
context.bulk_get_files(&target_fds, &mut token)
context.bulk_get_files(&target_fds, &mut context_token)
} else {
context.bulk_remove_files(&target_fds, &mut token)
context.bulk_remove_files(&target_fds, &mut context_token)
}?
.into_iter()
.map(|f| f.description)
@@ -395,18 +473,22 @@ fn call_fdread(
metadata: &[u64],
token: &mut CleanLockToken,
) -> Result<usize> {
let desc = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let file_descriptor = context
.get_file(fd, &mut context_token)
.ok_or(Error::new(EBADF))?;
*file_descriptor.description.read(context_token.token())
};
let (scheme, number) = {
let (scheme, number) = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let file_descriptor = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?;
let desc = file_descriptor.description.read(token.token());
(desc.scheme, desc.number)
};
let scheme = scheme::get_scheme(token.token(), scheme)?;
(scheme, number)
let scheme = desc.get_scheme(token)?;
let number = desc.number;
(
scheme,
number,
)
};
scheme.kfdread(number, payload, flags, metadata, token)
@@ -440,9 +522,9 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)
}
.ok_or(Error::new(EBADF))?;
let (scheme_id, number, flags) = {
let desc = file.description.write(token.token());
(desc.scheme, desc.number, desc.flags)
let (number, flags, desc) = {
let desc = *file.description.read(token.token());
(desc.number, desc.flags, desc)
};
if cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC {
@@ -460,7 +542,7 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)
// Communicate fcntl with scheme
if cmd != F_GETFD && cmd != F_SETFD {
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
let scheme = desc.get_scheme(token)?;
scheme.fcntl(number, cmd, arg, token)?;
};
@@ -518,13 +600,11 @@ pub fn flink(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken)
let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;
let (number, scheme_id) = {
let desc = file.description.read(token.token());
(desc.number, desc.scheme)
let (number, scheme) = {
let desc = *file.description.read(token.token());
(desc.number, desc.get_scheme(token)?)
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
// TODO: Check EXDEV.
/*
if scheme_id != description.scheme {
@@ -554,13 +634,11 @@ pub fn frename(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken
let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;
let (number, scheme_id) = {
let desc = file.description.read(token.token());
(desc.number, desc.scheme)
let (number, scheme) = {
let desc = *file.description.read(token.token());
(desc.number, desc.get_scheme(token)?)
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
// TODO: Check EXDEV.
/*
if scheme_id != description.scheme {
@@ -28,6 +28,11 @@ use crate::{
sync::CleanLockToken,
};
/// Local syscall numbers not yet in the redox_syscall crate.
/// These are allocated from the 987+ range to avoid collisions with crate numbers.
pub const SYS_SCHED_SETAFFINITY: usize = 987;
pub const SYS_SCHED_GETAFFINITY: usize = 988;
/// Debug
pub mod debug;
@@ -220,6 +225,10 @@ pub fn syscall(
unlinkat(fd, UserSlice::ro(c, d)?, e, f as _, g as _, token).map(|()| 0)
}
SYS_YIELD => sched_yield(token).map(|()| 0),
// P17-3: CPU affinity syscalls. Numbers allocated locally (not yet in redox_syscall crate).
SYS_SCHED_SETAFFINITY => sched_setaffinity(b, UserSlice::ro(c, d)?, token),
SYS_SCHED_GETAFFINITY => sched_getaffinity(b, UserSlice::wo(c, d)?, token),
SYS_NANOSLEEP => nanosleep(
UserSlice::ro(b, size_of::<TimeSpec>())?,
UserSlice::wo(c, size_of::<TimeSpec>())?.none_if_null(),
@@ -11,6 +11,7 @@ use crate::{
memory::{AddrSpace, Grant, PageSpan},
ContextRef,
},
cpu_set::RawMask,
event,
sync::{CleanLockToken, RwLock},
syscall::flag::{EventFlags, O_CREAT, O_RDWR},
@@ -271,24 +272,95 @@ unsafe fn bootstrap_mem(bootstrap: &crate::startup::Bootstrap) -> &'static [u8]
}
fn insert_fd(scheme: SchemeId, number: usize, cloexec: bool, token: &mut CleanLockToken) -> usize {
let description = Arc::new(RwLock::new(FileDescription::new(
scheme,
number,
0,
(O_CREAT | O_RDWR) as u32,
InternalFlags::empty(),
token,
)));
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let (context, mut context_token) = current.token_split();
context
.add_file_min(
FileDescriptor {
description: Arc::new(RwLock::new(FileDescription {
scheme,
number,
offset: 0,
flags: (O_CREAT | O_RDWR) as u32,
internal_flags: InternalFlags::empty(),
})),
description,
cloexec,
},
syscall::flag::UPPER_FDTBL_TAG + scheme.get(),
&mut token,
&mut context_token,
)
.expect("failed to insert fd to current context")
.get()
}
/// Set CPU affinity mask for a process.
///
/// # Arguments (syscall ABI)
/// - `pid`: Process ID (0 = current process; other PIDs not yet supported)
/// - `mask_ptr`: Pointer to a `RawMask` (32 bytes on 64-bit, 256-bit bitmap)
/// - `mask_len`: Length of mask in bytes (must equal `size_of::<RawMask>()`)
pub fn sched_setaffinity(
pid: usize,
mask_ptr: super::usercopy::UserSliceRo,
token: &mut CleanLockToken,
) -> Result<usize> {
// Validate mask size
if mask_ptr.len() != core::mem::size_of::<RawMask>() {
return Err(Error::new(super::error::EINVAL));
}
// pid == 0 means current process
let target = if pid == 0 {
context::current()
} else {
// TODO: Support PID-based lookup (requires context list iteration
// with lock token downgrades). For now, only pid=0 is supported.
return Err(Error::new(super::error::ESRCH));
};
// Read mask from userspace
let raw_mask: RawMask = unsafe { mask_ptr.read_exact() }?;
// Apply to context's affinity mask
let mut ctx = target.write(token.token());
ctx.sched_affinity.override_from(&raw_mask);
Ok(0)
}
/// Get CPU affinity mask for a process.
///
/// # Arguments
/// - `pid`: Process ID (0 = current process; other PIDs not yet supported)
/// - `mask_ptr`: Writable user slice covering a `RawMask` buffer. The syscall
///   dispatcher builds it from the pointer and length arguments, so the
///   length check below validates the user-supplied `mask_len`.
/// - `token`: Lock token used to read-lock the target context.
///
/// # Returns
/// Number of bytes written to mask_ptr on success
/// (always `size_of::<RawMask>()`).
///
/// # Errors
/// - `EINVAL` if the slice length is not `size_of::<RawMask>()`.
/// - `ESRCH` if `pid` is non-zero (PID lookup is not implemented).
/// - Any error produced while writing the mask to user memory.
pub fn sched_getaffinity(
    pid: usize,
    mask_ptr: super::usercopy::UserSliceWo,
    token: &mut CleanLockToken,
) -> Result<usize> {
    let mask_size = core::mem::size_of::<RawMask>();

    // Validate mask size
    if mask_ptr.len() != mask_size {
        return Err(Error::new(super::error::EINVAL));
    }

    // pid == 0 means current process
    if pid != 0 {
        return Err(Error::new(super::error::ESRCH));
    }

    // Snapshot the mask. The read guard is a temporary that is released at
    // the end of this statement, so the context lock is NOT held while
    // copying to user memory below (mirroring sched_setaffinity, which does
    // its user copy before taking the lock).
    let raw_mask = context::current()
        .read(token.token())
        .sched_affinity
        .to_raw();

    mask_ptr.copy_common_bytes_from_slice(crate::cpu_set::mask_as_bytes(&raw_mask))?;
    Ok(mask_size)
}
@@ -0,0 +1,112 @@
#####################################################
# #
# THIS FILE IS GENERATED, DO NOT EDIT! #
# #
# Generated with "ci-fairy generate-template", edit #
# .gitlab-ci/ci.template and .gitlab-ci/config.yml #
# and rerun "ci-fairy generate-template" to change #
# this file. #
# #
#####################################################
.templates_sha: &template_sha 3d03cccd770c04e63b40325b42223495274d6a1d
include:
- project: 'freedesktop/ci-templates'
ref: *template_sha
file:
- '/templates/ci-fairy.yml'
- '/templates/fedora.yml'
- template: Security/SAST.gitlab-ci.yml
stages:
- sanity check
- prep
- build
- test
variables:
FDO_UPSTREAM_REPO: xorg/lib/libxcvt
MESON_BUILDDIR: "builddir"
NINJA_ARGS: ''
MESON_ARGS: ''
MESON_TEST_ARGS: ''
GIT_DEPTH: 1
.policy:
retry:
max: 2
when:
- runner_system_failure
- stuck_or_timeout_failure
# cancel run when a newer version is pushed to the branch
interruptible: true
# Re-generate the CI script and make sure it's the one currently checked in
# If this job fails, re-generate the gitlab-ci.yml script, see
# $SRCDIR/.gitlab-ci/generate-gitlab-ci.py
#
check-ci-script:
extends:
- .fdo.ci-fairy
stage: sanity check
script:
- ci-fairy generate-template --verify && exit 0 || true
- echo "Committed gitlab-ci.yml differs from generated gitlab-ci.yml. Please verify"
- exit 1
#
# Verify that commit messages are as expected, signed-off, etc.
#
check-commit:
extends:
- .fdo.ci-fairy
stage: sanity check
script:
- ci-fairy check-commits --signed-off-by --junit-xml=results.xml
except:
- master@xorg/lib/libxcvt
variables:
GIT_DEPTH: 100
artifacts:
reports:
junit: results.xml
#
# Verify that merge request has the "allow collaboration" checkbox ticked
#
check-merge-request:
extends:
- .fdo.ci-fairy
stage: sanity check
script:
- ci-fairy check-merge-request --require-allow-collaboration --junit-xml=results.xml
artifacts:
when: on_failure
reports:
junit: results.xml
allow_failure: true
.fedora.34:
variables:
FDO_DISTRIBUTION_VERSION: '34'
FDO_DISTRIBUTION_TAG: '2022-08-03.0'
prep-fedora-34:
extends:
- .fdo.container-build@fedora
- .fedora.34
stage: prep
variables:
FDO_DISTRIBUTION_PACKAGES: "meson gcc"
build-fedora-34:
extends:
- .fdo.distribution-image@fedora
- .fedora.34
stage: build
script:
- .gitlab-ci/meson-build.sh
@@ -0,0 +1,118 @@
{# You're looking at the template here, so you can ignore the below
warning. This is the right file to edit #}
#####################################################
# #
# THIS FILE IS GENERATED, DO NOT EDIT! #
# #
# Generated with "ci-fairy generate-template", edit #
# .gitlab-ci/ci.template and .gitlab-ci/config.yml #
# and rerun "ci-fairy generate-template" to change #
# this file. #
# #
#####################################################
.templates_sha: &template_sha 3d03cccd770c04e63b40325b42223495274d6a1d
include:
- project: 'freedesktop/ci-templates'
ref: *template_sha
file:
- '/templates/ci-fairy.yml'
{% for d in distributions %}
- '/templates/{{d.name}}.yml'
{% endfor %}
- template: Security/SAST.gitlab-ci.yml
stages:
- sanity check
- prep
- build
- test
variables:
FDO_UPSTREAM_REPO: xorg/lib/libxcvt
MESON_BUILDDIR: "builddir"
NINJA_ARGS: ''
MESON_ARGS: ''
MESON_TEST_ARGS: ''
GIT_DEPTH: 1
.policy:
retry:
max: 2
when:
- runner_system_failure
- stuck_or_timeout_failure
# cancel run when a newer version is pushed to the branch
interruptible: true
# Re-generate the CI script and make sure it's the one currently checked in
# If this job fails, re-generate the gitlab-ci.yml script, see
# $SRCDIR/.gitlab-ci/generate-gitlab-ci.py
#
check-ci-script:
extends:
- .fdo.ci-fairy
stage: sanity check
script:
- ci-fairy generate-template --verify && exit 0 || true
- echo "Committed gitlab-ci.yml differs from generated gitlab-ci.yml. Please verify"
- exit 1
#
# Verify that commit messages are as expected, signed-off, etc.
#
check-commit:
extends:
- .fdo.ci-fairy
stage: sanity check
script:
- ci-fairy check-commits --signed-off-by --junit-xml=results.xml
except:
- master@xorg/lib/libxcvt
variables:
GIT_DEPTH: 100
artifacts:
reports:
junit: results.xml
#
# Verify that merge request has the "allow collaboration" checkbox ticked
#
check-merge-request:
extends:
- .fdo.ci-fairy
stage: sanity check
script:
- ci-fairy check-merge-request --require-allow-collaboration --junit-xml=results.xml
artifacts:
when: on_failure
reports:
junit: results.xml
allow_failure: true
{% for d in distributions %}
.{{d.name}}.{{d.version}}:
variables:
FDO_DISTRIBUTION_VERSION: '{{d.version}}'
FDO_DISTRIBUTION_TAG: '{{d.tag}}'
prep-{{d.name}}-{{d.version}}:
extends:
- .fdo.container-build@{{d.name}}
- .{{d.name}}.{{d.version}}
stage: prep
variables:
FDO_DISTRIBUTION_PACKAGES: "{{' '.join(d.packages)}}"
build-{{d.name}}-{{d.version}}:
extends:
- .fdo.distribution-image@{{d.name}}
- .{{d.name}}.{{d.version}}
stage: build
script:
- .gitlab-ci/meson-build.sh
{% endfor %}
@@ -0,0 +1,9 @@
.default_tag: &default_tag '2022-08-03.0'
distributions:
- name: fedora
tag: *default_tag
version: 34
packages:
- meson
- gcc
+48
View File
@@ -0,0 +1,48 @@
#!/bin/bash
#
# Configure and build the project with meson/ninja and, when
# $MESON_TEST_ARGS is non-empty, run the test suite.
#
# Inputs (environment, optionally provided by ./.meson_environment):
#   MESON_BUILDDIR   build directory (required; it is wiped first)
#   MESON_ARGS       extra arguments for the meson setup step
#   NINJA_ARGS       extra arguments for ninja
#   MESON_TEST_ARGS  extra arguments for "meson test"; empty = skip tests
#
# Exit status: 1 if MESON_BUILDDIR is unset, the failing command's status if
# configure/build fails, otherwise the status of "meson test" (or 0 when
# tests are skipped).

if [[ -f .meson_environment ]]; then
    . .meson_environment
fi

if [[ -z "$MESON_BUILDDIR" ]]; then
    echo "\$MESON_BUILDDIR undefined."
    exit 1
fi

# emulate a few gitlab variables to make it easier to
# run and debug locally.
if [[ -z "$CI_JOB_ID" ]] || [[ -z "$CI_JOB_NAME" ]] || [[ -z "$CI_PROJECT_NAME" ]]; then
    echo "Missing \$CI_JOB_ID or \$CI_JOB_NAME".
    # Quote $PWD so a checkout path containing spaces still works.
    CI_PROJECT_NAME=$(basename "$PWD")
    CI_JOB_ID=$(date +%s)
    # Double quotes: the project name must be expanded here. With single
    # quotes the job name was the literal text '${CI_PROJECT_NAME}-job-local'.
    CI_JOB_NAME="${CI_PROJECT_NAME}-job-local"
    echo "Simulating gitlab environment: "
    echo " CI_JOB_ID=$CI_JOB_ID"
    echo " CI_JOB_NAME=$CI_JOB_NAME"
    echo " CI_PROJECT_NAME=$CI_PROJECT_NAME"
fi

echo "*************************************************"
echo "builddir: $MESON_BUILDDIR"
echo "meson args: $MESON_ARGS"
echo "ninja args: $NINJA_ARGS"
echo "meson test args: $MESON_TEST_ARGS"
echo "*************************************************"

set -e

rm -rf "$MESON_BUILDDIR"
# The *_ARGS variables are intentionally unquoted: they hold
# whitespace-separated argument lists.
meson "$MESON_BUILDDIR" $MESON_ARGS
meson configure "$MESON_BUILDDIR"
ninja -C "$MESON_BUILDDIR" $NINJA_ARGS

if [[ -z "$MESON_TEST_ARGS" ]]; then
    exit 0
fi

# we still want to generate the reports, even if meson test fails:
# capture the status explicitly so "set -e" does not abort the script on a
# failing test run before anything placed after this line can execute.
exit_code=0
meson test -C "$MESON_BUILDDIR" $MESON_TEST_ARGS --print-errorlogs || exit_code=$?

exit $exit_code
+67
View File
@@ -0,0 +1,67 @@
Copyright 2005-2006 Luc Verhaegen.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
Copyright 2005-2006 Luc Verhaegen.
Copyright © 2021 Red Hat, Inc.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
Copyright © 2000 Compaq Computer Corporation
Copyright © 2002 Hewlett Packard Company
Copyright © 2006 Intel Corporation
Copyright © 2008, 2021 Red Hat, Inc.
Permission to use, copy, modify, distribute, and sell this software and its
documentation for any purpose is hereby granted without fee, provided that
the above copyright notice appear in all copies and that both that copyright
notice and this permission notice appear in supporting documentation, and
that the name of the copyright holders not be used in advertising or
publicity pertaining to distribution of the software without specific,
written prior permission. The copyright holders make no representations
about the suitability of this software for any purpose. It is provided "as
is" without express or implied warranty.
THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
OF THIS SOFTWARE.
+36
View File
@@ -0,0 +1,36 @@
libxcvt
=======
`libxcvt` is a library providing a standalone version of the X server
implementation of the VESA CVT standard timing modelines generator.
`libxcvt` also provides a standalone version of the command-line tool
`cvt`, copied from the Xorg implementation, which is meant to be a drop-in
replacement for the version provided by the `Xorg` server.
An example output is:
```
$ cvt --verbose 1920 1200 75
# 1920x1200 74.93 Hz (CVT 2.30MA) hsync: 94.04 kHz; pclk: 245.25 MHz
Modeline "1920x1200_75.00" 245.25 1920 2064 2264 2608 1200 1203 1209 1255 -hsync +vsync
```
Building
========
`libxcvt` is built using [Meson](https://mesonbuild.com/)
$ git clone https://gitlab.freedesktop.org/xorg/lib/libxcvt.git
$ cd libxcvt
$ meson build/ --prefix=...
$ ninja -C build/ install
$ cd ..
Credit
======
The code base of `libxcvt` is identical to `xf86CVTMode()`; therefore,
all credit for `libxcvt` goes to the author (Luc Verhaegen) and the
contributors of `xf86CVTMode()` and the `cvt` utility as found in the
[xserver](https://gitlab.freedesktop.org/xorg/xserver/) repository.
+257
View File
@@ -0,0 +1,257 @@
/*
* Copyright 2005-2006 Luc Verhaegen.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
/* Standalone VESA CVT standard timing modelines generator. */
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <libxcvt/libxcvt.h>
/*
 * Report whether the requested mode matches the CVT standard.
 *
 * A mode is standard when its aspect ratio is one of 4:3, 16:9, 16:10, 5:4
 * or 15:9 (checked with exact integer arithmetic) AND its refresh rate is
 * exactly 50, 60, 75 or 85 Hz. With `verbose` set, a warning is written to
 * stderr for each of the two checks that fails. `reduced` is accepted for
 * interface compatibility but is not consulted.
 */
static bool
cvt_is_standard(int hdisplay, int vdisplay, float vrefresh, bool reduced, bool verbose)
{
    const bool aspect_ok =
        (!(vdisplay % 3) && ((vdisplay * 4 / 3) == hdisplay)) ||
        (!(vdisplay % 9) && ((vdisplay * 16 / 9) == hdisplay)) ||
        (!(vdisplay % 10) && ((vdisplay * 16 / 10) == hdisplay)) ||
        (!(vdisplay % 4) && ((vdisplay * 5 / 4) == hdisplay)) ||
        (!(vdisplay % 9) && ((vdisplay * 15 / 9) == hdisplay));

    const bool refresh_ok =
        (vrefresh == 50.0) || (vrefresh == 60.0) ||
        (vrefresh == 75.0) || (vrefresh == 85.0);

    if (!aspect_ok && verbose)
        fprintf(stderr, "Warning: Aspect Ratio is not CVT standard.\n");

    if (!refresh_ok && verbose)
        fprintf(stderr, "Warning: Refresh Rate %.2f is not CVT standard "
                "(50, 60, 75 or 85Hz).\n", vrefresh);

    return aspect_ok && refresh_ok;
}
/*
 * Write the command synopsis and option summary to stderr.
 *
 * Name is the program name shown in the "usage:" line. The --interlaced
 * option is intentionally not advertised here even though it is
 * implemented. The layout was modelled on gtf's usage text.
 */
static void
print_usage(char *Name)
{
    FILE *err = stderr;

    fputs("\n", err);
    fprintf(err, "usage: %s [-v|--verbose] [-r|--reduced] X Y [refresh]\n",
            Name);
    fputs("\n", err);
    fputs(" -v|--verbose : Warn about CVT standard adherence.\n", err);
    fputs(" -r|--reduced : Create a mode with reduced blanking "
          "(default: normal blanking).\n", err);
    fputs(" X : Desired horizontal resolution "
          "(multiple of 8, required).\n", err);
    fputs(" Y : Desired vertical resolution (required).\n", err);
    fputs(" refresh : Desired refresh rate (default: 60.0Hz).\n", err);
    fputs("\n", err);
    fputs("Calculates VESA CVT (Coordinated Video Timing) modelines"
          " for use with X.\n", err);
}
/*
 * Print the "# WxH R Hz (CVT ...) hsync: ...; pclk: ..." comment line that
 * precedes the modeline on stdout.
 *
 * For a CVT standard mode (is_cvt) the parenthesised tag holds the pixel
 * count in megapixels, a one-character aspect-ratio code and, for reduced
 * blanking, a "-R" suffix. Otherwise the tag is just "(CVT)".
 */
static void
print_comment(struct libxcvt_mode_info *mode_info, bool is_cvt, bool reduced)
{
    printf("# %dx%d %.2f Hz ", mode_info->hdisplay, mode_info->vdisplay, mode_info->vrefresh);

    if (!is_cvt) {
        printf("(CVT) ");
    }
    else {
        /* Stays empty when none of the known aspect ratios match. */
        const char *aspect_code = "";

        if (!(mode_info->vdisplay % 3) &&
            ((mode_info->vdisplay * 4 / 3) == mode_info->hdisplay))
            aspect_code = "3";      /* 4:3 */
        else if (!(mode_info->vdisplay % 9) &&
                 ((mode_info->vdisplay * 16 / 9) == mode_info->hdisplay))
            aspect_code = "9";      /* 16:9 */
        else if (!(mode_info->vdisplay % 10) &&
                 ((mode_info->vdisplay * 16 / 10) == mode_info->hdisplay))
            aspect_code = "A";      /* 16:10 */
        else if (!(mode_info->vdisplay % 4) &&
                 ((mode_info->vdisplay * 5 / 4) == mode_info->hdisplay))
            aspect_code = "4";      /* 5:4 */
        else if (!(mode_info->vdisplay % 9) &&
                 ((mode_info->vdisplay * 15 / 9) == mode_info->hdisplay))
            aspect_code = "9";      /* 15:9 */

        printf("(CVT %.2fM",
               ((float) mode_info->hdisplay * mode_info->vdisplay) / 1000000.0);
        fputs(aspect_code, stdout);
        if (reduced)
            printf("-R");
        printf(") ");
    }

    printf("hsync: %.2f kHz; ", mode_info->hsync);
    /* dot_clock is divided by 1000 to present it in MHz. */
    printf("pclk: %.2f MHz", ((float) mode_info->dot_clock) / 1000.0);
    printf("\n");
}
/*
 * Print the modeline itself on stdout. Originally grabbed from xf86Mode.c.
 *
 * The mode name is built from the hdisplay/vdisplay/vrefresh arguments
 * rather than from mode_info, so the user gets a predictable name:
 * "<W>x<H>R" for reduced blanking, "<W>x<H>_<refresh>" otherwise. The
 * timing numbers and sync/interlace flags come from mode_info.
 */
static void
print_mode_line(struct libxcvt_mode_info *mode_info, int hdisplay, int vdisplay, float vrefresh,
                bool reduced)
{
    /* Flag suffixes, listed in the order they are emitted. */
    static const struct {
        unsigned int mask;
        const char *suffix;
    } flag_suffixes[] = {
        { LIBXCVT_MODE_FLAG_INTERLACE,      " interlace" },
        { LIBXCVT_MODE_FLAG_HSYNC_POSITIVE, " +hsync" },
        { LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE, " -hsync" },
        { LIBXCVT_MODE_FLAG_VSYNC_POSITIVE, " +vsync" },
        { LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE, " -vsync" },
    };
    unsigned int idx;

    if (!reduced)
        printf("Modeline \"%dx%d_%.2f\" ", hdisplay, vdisplay, vrefresh);
    else
        printf("Modeline \"%dx%dR\" ", hdisplay, vdisplay);

    printf("%6.2f %i %i %i %i %i %i %i %i", mode_info->dot_clock / 1000.,
           mode_info->hdisplay, mode_info->hsync_start, mode_info->hsync_end, mode_info->htotal,
           mode_info->vdisplay, mode_info->vsync_start, mode_info->vsync_end, mode_info->vtotal);

    for (idx = 0; idx < sizeof flag_suffixes / sizeof flag_suffixes[0]; idx++) {
        if (mode_info->mode_flags & flag_suffixes[idx].mask)
            printf("%s", flag_suffixes[idx].suffix);
    }

    printf("\n");
}
/*
 * cvt entry point.
 *
 * Accepts "[-v|--verbose] [-r|--reduced] [-i|--interlaced] [-h|--help]
 * X Y [refresh]", generates the timing with libxcvt_gen_mode_info() and
 * prints a descriptive comment line followed by the modeline on stdout.
 *
 * Exit status: 0 when a modeline was printed or help was explicitly
 * requested with -h/--help; 1 on every error (wrong argument count,
 * missing, non-numeric, non-positive or non-finite values, a refresh rate
 * that is not a multiple of 60Hz with --reduced, allocation failure), so
 * that scripts can detect failure.
 */
int
main(int argc, char *argv[])
{
    struct libxcvt_mode_info *mode_info;
    int hdisplay = 0, vdisplay = 0;
    float vrefresh = 0.0;
    bool reduced = false, verbose = false, is_cvt;
    bool interlaced = false;
    int n;

    if ((argc < 3) || (argc > 7)) {
        print_usage(argv[0]);
        return 1;
    }

    /*
     * Every argument that is not a recognised flag is taken as the next
     * positional value: X, then Y, then refresh. An unknown flag or other
     * non-numeric token makes atoi/atof return 0, which is rejected below
     * together with negative and non-finite values.
     */
    for (n = 1; n < argc; n++) {
        if (!strcmp(argv[n], "-r") || !strcmp(argv[n], "--reduced"))
            reduced = true;
        else if (!strcmp(argv[n], "-i") || !strcmp(argv[n], "--interlaced"))
            interlaced = true;
        else if (!strcmp(argv[n], "-v") || !strcmp(argv[n], "--verbose"))
            verbose = true;
        else if (!strcmp(argv[n], "-h") || !strcmp(argv[n], "--help")) {
            print_usage(argv[0]);
            return 0;
        }
        else if (!hdisplay) {
            hdisplay = atoi(argv[n]);
            if (hdisplay <= 0) {
                print_usage(argv[0]);
                return 1;
            }
        }
        else if (!vdisplay) {
            vdisplay = atoi(argv[n]);
            if (vdisplay <= 0) {
                print_usage(argv[0]);
                return 1;
            }
        }
        else if (!vrefresh) {
            vrefresh = atof(argv[n]);
            if (!isfinite(vrefresh) || vrefresh <= 0.0) {
                print_usage(argv[0]);
                return 1;
            }
        }
        else {
            /* A fourth positional argument is one too many. */
            print_usage(argv[0]);
            return 1;
        }
    }

    /* Only flags were given: the required X and Y are missing. */
    if (!hdisplay || !vdisplay) {
        print_usage(argv[0]);
        return 1;
    }

    /* Default to 60.0Hz */
    if (!vrefresh)
        vrefresh = 60.0;

    /* Horizontal timing is always a multiple of 8: round up. */
    if (hdisplay & 0x07) {
        hdisplay &= ~0x07;
        hdisplay += 8;
    }

    if (reduced) {
        if ((vrefresh / 60.0) != floor(vrefresh / 60.0)) {
            fprintf(stderr,
                    "\nERROR: Multiple of 60Hz refresh rate required for "
                    " reduced blanking.\n");
            print_usage(argv[0]);
            return 1;
        }
    }

    mode_info = libxcvt_gen_mode_info(hdisplay, vdisplay, vrefresh, reduced, interlaced);
    if (!mode_info) {
        fprintf(stderr, "Out of memory!\n");
        return 1;
    }

    is_cvt = cvt_is_standard(hdisplay, vdisplay, vrefresh, reduced, verbose);
    print_comment(mode_info, is_cvt, reduced);
    print_mode_line(mode_info, hdisplay, vdisplay, vrefresh, reduced);

    free(mode_info);
    return 0;
}
@@ -0,0 +1,10 @@
# Sources of the 'cvt' command-line tool.
cvt_src = [
'cvt.c',
]
# Build and install the 'cvt' executable. It uses the project include
# directory (inc), links against the libxcvt library target and depends on
# the math library lookup result (mdep); all three are defined by other
# meson.build files of this project.
executable('cvt',
cvt_src,
include_directories : inc,
link_with : libxcvt,
dependencies: mdep,
install : true)
@@ -0,0 +1,46 @@
/*
* Copyright 2005-2006 Luc Verhaegen.
* Copyright © 2021 Red Hat, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef _LIBCVT_H_
#define _LIBCVT_H_
#include <stdbool.h>
#include <libxcvt/libxcvt_mode.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Generate a CVT mode for the given resolution and refresh rate.
 *
 *   hdisplay   - requested horizontal resolution in pixels
 *   vdisplay   - requested vertical resolution in lines
 *   vrefresh   - requested refresh rate in Hz
 *   reduced    - true to use the reduced blanking calculation
 *   interlaced - true to generate an interlaced mode
 *
 * Returns a newly allocated struct libxcvt_mode_info (see libxcvt_mode.h),
 * or NULL on failure. NOTE(review): the implementation is not visible from
 * this header; ownership presumably passes to the caller, who releases the
 * result with free() - confirm against libxcvt.c.
 */
struct libxcvt_mode_info *
libxcvt_gen_mode_info(int hdisplay,
int vdisplay,
float vrefresh,
bool reduced,
bool interlaced);
#ifdef __cplusplus
}
#endif
#endif /* _LIBCVT_H_ */
@@ -0,0 +1,56 @@
/*
* Copyright © 2000 Compaq Computer Corporation
* Copyright © 2002 Hewlett Packard Company
* Copyright © 2006 Intel Corporation
* Copyright © 2008, 2021 Red Hat, Inc.
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that copyright
* notice and this permission notice appear in supporting documentation, and
* that the name of the copyright holders not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. The copyright holders make no representations
* about the suitability of this software for any purpose. It is provided "as
* is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
* EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THIS SOFTWARE.
*
*/
#ifndef _LIBXCVT_MODE_H_
#define _LIBXCVT_MODE_H_
#include <stdint.h>
/*
 * Bit flags stored in libxcvt_mode_info.mode_flags; several may be OR'ed
 * together. Conveniently chosen to match the RandR definitions.
 */
enum libxcvt_mode_flags {
LIBXCVT_MODE_FLAG_HSYNC_POSITIVE = (1 << 0), /* bit 0: positive hsync */
LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE = (1 << 1), /* bit 1: negative hsync */
LIBXCVT_MODE_FLAG_VSYNC_POSITIVE = (1 << 2), /* bit 2: positive vsync */
LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE = (1 << 3), /* bit 3: negative vsync */
LIBXCVT_MODE_FLAG_INTERLACE = (1 << 4), /* bit 4: interlaced mode */
};
/*
 * Description of one display mode (a modeline in structured form).
 *
 * NOTE(review): the units given below are taken from how the library and
 * the cvt tool in this project fill in and print these fields - confirm
 * before relying on them elsewhere.
 */
struct libxcvt_mode_info {
uint32_t hdisplay; /* active horizontal pixels */
uint32_t vdisplay; /* active vertical lines */
float vrefresh; /* refresh (field) rate, presumably in Hz */
float hsync; /* horizontal frequency, presumably in kHz */
uint64_t dot_clock; /* pixel clock, presumably in kHz */
uint16_t hsync_start; /* horizontal sync start position */
uint16_t hsync_end; /* horizontal sync end position */
uint16_t htotal; /* total pixels per line, blanking included */
uint16_t vsync_start; /* vertical sync start position */
uint16_t vsync_end; /* vertical sync end position */
uint16_t vtotal; /* total lines, blanking included */
enum libxcvt_mode_flags mode_flags; /* OR of LIBXCVT_MODE_FLAG_* bits */
};
#endif /* _LIBXCVT_MODE_H_ */
@@ -0,0 +1 @@
# Install the public headers into a 'libxcvt' subdirectory of the include
# directory, so that users include them as <libxcvt/libxcvt.h>.
install_headers('libxcvt.h','libxcvt_mode.h', subdir: 'libxcvt')
@@ -0,0 +1 @@
# Process the meson.build found in the 'libxcvt' subdirectory.
subdir('libxcvt')
@@ -0,0 +1,301 @@
/*
* Copyright 2005-2006 Luc Verhaegen.
* Copyright © 2021 Red Hat, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
/* Standalone VESA CVT standard timing modelines generator. */
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <libxcvt/libxcvt.h>
/*
 * Generate a CVT standard mode from hdisplay, vdisplay and vrefresh.
 *
 * Parameters:
 *   hdisplay   - requested horizontal resolution in pixels; rounded down to
 *                a multiple of CVT_H_GRANULARITY (8) before use.
 *   vdisplay   - requested vertical resolution in lines.
 *   vrefresh   - requested refresh rate in Hz; 0 selects the CVT default of
 *                60.0Hz.
 *   reduced    - use the reduced blanking calculation instead of the
 *                simplified GTF one.
 *   interlaced - compute the timings per field and flag the mode as
 *                interlaced.
 *
 * Returns a zero-initialised, calloc()-allocated struct libxcvt_mode_info
 * with the computed timings filled in, or NULL if the allocation fails.
 * The arguments are not validated here.
 *
 * These calculations are stolen from the CVT calculation spreadsheet written
 * by Graham Loveridge. He seems to be claiming no copyright and there seems to
 * be no license attached to this. He apparently just wants to see his name
 * mentioned.
 *
 * This file can be found at http://www.vesa.org/Public/CVT/CVTd6r1.xls
 *
 * Comments and structure corresponds to the comments and structure of the xls.
 * This should ease importing of future changes to the standard (not very
 * likely though).
 *
 * About margins; I'm sure that they are to be the bit between HDisplay and
 * HBlankStart, HBlankEnd and HTotal, VDisplay and VBlankStart, VBlankEnd and
 * VTotal, where the overscan colour is shown. FB seems to call _all_ blanking
 * outside sync "margin" for some reason. Since we prefer seeing proper
 * blanking instead of the overscan colour, and since the Crtc* values will
 * probably get altered after us, we will disable margins altogether. With
 * these calculations, Margins will plainly expand H/VDisplay, and we don't
 * want that. -- libv
 *
 */
struct libxcvt_mode_info *
libxcvt_gen_mode_info(int hdisplay, int vdisplay, float vrefresh, bool reduced, bool interlaced)
{
/* Margins are permanently disabled (see above), so hmargin and vmargin
 * below always end up as 0. */
bool margins = false;
float vfield_rate, hperiod;
int hdisplay_rnd, hmargin;
int vdisplay_rnd, vmargin, vsync;
float interlace; /* 0.5 for interlaced modes, 0.0 otherwise. Please rename this */
struct libxcvt_mode_info *mode_info;
mode_info = calloc(1, sizeof *mode_info);
if (!mode_info)
return NULL;
mode_info->hdisplay = hdisplay;
mode_info->vdisplay = vdisplay;
mode_info->vrefresh = vrefresh;
/* 1) top/bottom margin size (% of height) - default: 1.8 */
#define CVT_MARGIN_PERCENTAGE 1.8
/* 2) character cell horizontal granularity (pixels) - default 8 */
#define CVT_H_GRANULARITY 8
/* 4) Minimum vertical front porch (lines) - default 3 */
#define CVT_MIN_V_PORCH_RND 3
/* 4) Minimum number of vertical back porch lines - default 6 */
#define CVT_MIN_V_BPORCH 6
/* Pixel Clock step (kHz) */
#define CVT_CLOCK_STEP 250
/* CVT default is 60.0Hz */
if (!mode_info->vrefresh)
mode_info->vrefresh = 60.0;
/* 1. Required field rate */
if (interlaced)
vfield_rate = mode_info->vrefresh * 2;
else
vfield_rate = mode_info->vrefresh;
/* 2. Horizontal pixels (rounded down to the cell granularity) */
hdisplay_rnd = mode_info->hdisplay - (mode_info->hdisplay % CVT_H_GRANULARITY);
/* 3. Determine left and right borders */
if (margins) {
/* right margin is actually exactly the same as left */
hmargin = (((float) hdisplay_rnd) * CVT_MARGIN_PERCENTAGE / 100.0);
hmargin -= hmargin % CVT_H_GRANULARITY;
}
else {
hmargin = 0;
}
/* 4. Find total active pixels */
mode_info->hdisplay = hdisplay_rnd + 2 * hmargin;
/* 5. Find number of lines per field */
if (interlaced)
vdisplay_rnd = mode_info->vdisplay / 2;
else
vdisplay_rnd = mode_info->vdisplay;
/* 6. Find top and bottom margins */
/* nope. */
if (margins)
/* top and bottom margins are equal again. */
vmargin = (((float) vdisplay_rnd) * CVT_MARGIN_PERCENTAGE / 100.0);
else
vmargin = 0;
mode_info->vdisplay = mode_info->vdisplay + 2 * vmargin;
/* 7. interlace */
if (interlaced)
interlace = 0.5;
else
interlace = 0.0;
/* Determine vsync Width from aspect ratio:
 * 4:3 -> 4, 16:9 -> 5, 16:10 -> 6, 5:4 -> 7, 15:9 -> 7, anything else -> 10 */
if (!(mode_info->vdisplay % 3) && ((mode_info->vdisplay * 4 / 3) == mode_info->hdisplay))
vsync = 4;
else if (!(mode_info->vdisplay % 9) && ((mode_info->vdisplay * 16 / 9) == mode_info->hdisplay))
vsync = 5;
else if (!(mode_info->vdisplay % 10) && ((mode_info->vdisplay * 16 / 10) == mode_info->hdisplay))
vsync = 6;
else if (!(mode_info->vdisplay % 4) && ((mode_info->vdisplay * 5 / 4) == mode_info->hdisplay))
vsync = 7;
else if (!(mode_info->vdisplay % 9) && ((mode_info->vdisplay * 15 / 9) == mode_info->hdisplay))
vsync = 7;
else /* Custom */
vsync = 10;
if (!reduced) { /* simplified GTF calculation */
/* 4) Minimum time of vertical sync + back porch interval (µs)
 * default 550.0 */
#define CVT_MIN_VSYNC_BP 550.0
/* 3) Nominal HSync width (% of line period) - default 8 */
#define CVT_HSYNC_PERCENTAGE 8
float hblank_percentage;
int vsync_and_back_porch, vback_porch;
int hblank, hsync_w;
/* 8. Estimated Horizontal period */
hperiod = ((float) (1000000.0 / vfield_rate - CVT_MIN_VSYNC_BP)) /
(vdisplay_rnd + 2 * vmargin + CVT_MIN_V_PORCH_RND + interlace);
/* 9. Find number of lines in sync + backporch */
if (((int) (CVT_MIN_VSYNC_BP / hperiod) + 1) <
(vsync + CVT_MIN_V_BPORCH))
vsync_and_back_porch = vsync + CVT_MIN_V_BPORCH;
else
vsync_and_back_porch = (int) (CVT_MIN_VSYNC_BP / hperiod) + 1;
/* 10. Find number of lines in back porch */
vback_porch = vsync_and_back_porch - vsync;
/* Kept to mirror the spreadsheet; the value is not used afterwards. */
(void) vback_porch;
/* 11. Find total number of lines in vertical field.
 * The sum is a float because of interlace; storing it in the integer
 * vtotal field drops the fractional part. */
mode_info->vtotal =
vdisplay_rnd + 2 * vmargin + vsync_and_back_porch + interlace +
CVT_MIN_V_PORCH_RND;
/* 5) Definition of Horizontal blanking time limitation */
/* Gradient (%/kHz) - default 600 */
#define CVT_M_FACTOR 600
/* Offset (%) - default 40 */
#define CVT_C_FACTOR 40
/* Blanking time scaling factor - default 128 */
#define CVT_K_FACTOR 128
/* Scaling factor weighting - default 20 */
#define CVT_J_FACTOR 20
/* NOTE: these two macros are not parenthesised; the expression in step 12
 * relies on operator precedence to evaluate them as intended. */
#define CVT_M_PRIME CVT_M_FACTOR * CVT_K_FACTOR / 256
#define CVT_C_PRIME (CVT_C_FACTOR - CVT_J_FACTOR) * CVT_K_FACTOR / 256 + \
CVT_J_FACTOR
/* 12. Find ideal blanking duty cycle from formula */
hblank_percentage = CVT_C_PRIME - CVT_M_PRIME * hperiod / 1000.0;
/* 13. Blanking time (duty cycle is clamped to at least 20%) */
if (hblank_percentage < 20)
hblank_percentage = 20;
hblank = mode_info->hdisplay * hblank_percentage / (100.0 - hblank_percentage);
hblank -= hblank % (2 * CVT_H_GRANULARITY);
/* 14. Find total number of pixels in a line. */
mode_info->htotal = mode_info->hdisplay + hblank;
/* Fill in HSync values: sync ends in the middle of the blanking interval */
mode_info->hsync_end = mode_info->hdisplay + hblank / 2;
hsync_w = (mode_info->htotal * CVT_HSYNC_PERCENTAGE) / 100;
hsync_w -= hsync_w % CVT_H_GRANULARITY;
mode_info->hsync_start = mode_info->hsync_end - hsync_w;
/* Fill in vsync values */
mode_info->vsync_start = mode_info->vdisplay + CVT_MIN_V_PORCH_RND;
mode_info->vsync_end = mode_info->vsync_start + vsync;
}
else { /* reduced blanking */
/* Minimum vertical blanking interval time (µs) - default 460 */
#define CVT_RB_MIN_VBLANK 460.0
/* Fixed number of clocks for horizontal sync */
#define CVT_RB_H_SYNC 32.0
/* Fixed number of clocks for horizontal blanking */
#define CVT_RB_H_BLANK 160.0
/* Fixed number of lines for vertical front porch - default 3 */
#define CVT_RB_VFPORCH 3
int vblank_interval_lines;
/* 8. Estimate Horizontal period. */
hperiod = ((float) (1000000.0 / vfield_rate - CVT_RB_MIN_VBLANK)) /
(vdisplay_rnd + 2 * vmargin);
/* 9. Find number of lines in vertical blanking */
vblank_interval_lines = ((float) CVT_RB_MIN_VBLANK) / hperiod + 1;
/* 10. Check if vertical blanking is sufficient */
if (vblank_interval_lines < (CVT_RB_VFPORCH + vsync + CVT_MIN_V_BPORCH))
vblank_interval_lines = CVT_RB_VFPORCH + vsync + CVT_MIN_V_BPORCH;
/* 11. Find total number of lines in vertical field */
mode_info->vtotal = vdisplay_rnd + 2 * vmargin + interlace + vblank_interval_lines;
/* 12. Find total number of pixels in a line */
mode_info->htotal = mode_info->hdisplay + CVT_RB_H_BLANK;
/* Fill in HSync values */
mode_info->hsync_end = mode_info->hdisplay + CVT_RB_H_BLANK / 2;
mode_info->hsync_start = mode_info->hsync_end - CVT_RB_H_SYNC;
/* Fill in vsync values */
mode_info->vsync_start = mode_info->vdisplay + CVT_RB_VFPORCH;
mode_info->vsync_end = mode_info->vsync_start + vsync;
}
/* 15/13. Find pixel clock frequency (kHz for xf86), rounded down to a
 * multiple of CVT_CLOCK_STEP */
mode_info->dot_clock = mode_info->htotal * 1000.0 / hperiod;
mode_info->dot_clock -= mode_info->dot_clock % CVT_CLOCK_STEP;
/* 16/14. Find actual Horizontal Frequency (kHz) */
mode_info->hsync = ((float) mode_info->dot_clock) / ((float) mode_info->htotal);
/* 17/15. Find actual Field rate.
 * This uses the per-field vtotal, i.e. before it is doubled for
 * interlaced modes below. */
mode_info->vrefresh = (1000.0 * ((float) mode_info->dot_clock)) /
((float) (mode_info->htotal * mode_info->vtotal));
/* 18/16. Find actual vertical frame frequency */
/* ignore - just set the mode flag for interlaced */
if (interlaced)
mode_info->vtotal *= 2;
/* Sync polarities: +hsync/-vsync for reduced blanking, -hsync/+vsync otherwise */
if (reduced)
mode_info->mode_flags |= LIBXCVT_MODE_FLAG_HSYNC_POSITIVE | LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE;
else
mode_info->mode_flags |= LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE | LIBXCVT_MODE_FLAG_VSYNC_POSITIVE;
if (interlaced)
mode_info->mode_flags |= LIBXCVT_MODE_FLAG_INTERLACE;
/* FWXGA hack adapted from hw/xfree86/modes/xf86EdidModes.c, because you can't say 1366.
 * A computed 1360x768 mode is reported as 1366 wide, with both hsync
 * positions moved back by one pixel. */
if (mode_info->hdisplay == 1360 && mode_info->vdisplay == 768) {
mode_info->hdisplay = 1366;
mode_info->hsync_start--;
mode_info->hsync_end--;
}
return mode_info;
}
@@ -0,0 +1,7 @@
# Sources compiled into the library.
libxcvt_sources = ['libxcvt.c']
# Build and install the shared library 'xcvt', versioned with the project
# version. darwin_versions sets the macOS compatibility and current
# versions.
libxcvt = shared_library('xcvt',
libxcvt_sources,
include_directories : inc,
version: meson.project_version(),
darwin_versions : ['1.0.0', '1.0.0' ],
install : true)
@@ -0,0 +1,41 @@
.TH CVT 1 @vendorversion@
.SH NAME
cvt - calculate VESA CVT mode lines
.SH SYNOPSIS
.B cvt
.RB [ \-v | \-\-verbose ]
.RB [ \-r | \-\-reduced ]
.I h-resolution
.I v-resolution
.RB [ refresh ]
.SH DESCRIPTION
.I Cvt
is a utility for calculating VESA Coordinated Video Timing modes. Given the
desired horizontal and vertical resolutions, a modeline adhering to the CVT
standard is printed. This modeline can be included in Xorg
.B xorg.conf(@filemansuffix@)
.
.SH OPTIONS
.TP 8
.BR refresh
Provide a vertical refresh rate in Hz. The CVT standard prefers either 50.0,
60.0, 75.0 or 85.0Hz. The default is 60.0Hz.
.TP 8
.BR \-v | \-\-verbose
Warn verbosely when a given mode does not completely correspond with CVT
standards.
.TP 8
.BR \-r | \-\-reduced
Create a mode with reduced blanking. This allows for higher frequency signals,
with a lower or equal dotclock. The refresh rate must be a multiple of 60Hz.
Reduced blanking is not suitable for Cathode Ray Tube (CRT) based displays.
.SH "SEE ALSO"
xorg.conf(@filemansuffix@), gtf(@appmansuffix@)
.SH AUTHOR
Luc Verhaegen.
.PP
This program is based on the Coordinated Video Timing sample
implementation written by Graham Loveridge. This file is publicly
available at <http://www.vesa.org/Public/CVT/CVTd6r1.xls>. CVT is a
VESA trademark.
@@ -0,0 +1,12 @@
# Values substituted for the @appmansuffix@, @filemansuffix@ and
# @vendorversion@ placeholders in the man page template.
man_conf = configuration_data()
man_conf.set('appmansuffix', '1')
man_conf.set('filemansuffix', '5')
man_conf.set('vendorversion',
'"libxcvt @0@" "X Version 11"'.format(meson.project_version()))
# Generate cvt.1 from cvt.man with the substitutions above and install it
# into the man1 directory below 'man' (a path defined outside this file).
configure_file(
input: 'cvt.man',
output: 'cvt.1',
install_dir: join_paths(man, 'man1'),
configuration: man_conf
)
@@ -0,0 +1,28 @@
# Top-level build definition for libxcvt: the library, the 'cvt' tool, the
# public headers and the man page.
project('libxcvt', 'c',
version: '0.1.3',
meson_version: '>= 0.40.0',
default_options: ['warning_level=1',
'buildtype=debugoptimized'])
# Project version split into its dot-separated components.
# NOTE(review): not referenced again in this file - confirm whether it is
# still needed.
libcvt_version = meson.project_version().split('.')
cc = meson.get_compiler('c')
# Math library; optional, since some platforms provide it as part of libc.
mdep = cc.find_library('m', required : false)
prefix = get_option('prefix')
# Variables shared with the subdirectories processed below.
inc = include_directories('include')
man = join_paths(prefix, get_option('mandir'))
subdir('include')
subdir('lib')
subdir('cvt')
subdir('man')
# Generate a pkg-config file for the library.
pkg_mod = import('pkgconfig')
pkg_mod.generate(libraries : libxcvt,
version : meson.project_version(),
name : 'libxcvt',
description : 'A Library to generate VESA CVT standard timing modelines.')
# Dependency object for projects that build libxcvt as a subproject.
libxcvt_dep = declare_dependency(link_with: libxcvt,
include_directories: inc)