From 0cbad356389f2f74ee1870a215c0f39f9c2c3718 Mon Sep 17 00:00:00 2001 From: Admin Pupkin Date: Mon, 18 May 2026 14:20:54 +0300 Subject: [PATCH] chore: kernel source patches, local recipe updates, and build artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel source (ephemeral — changes durable in local/patches/kernel/): - P20 x2apic ICR mode fix, P21 x2apic SMP fix applied - ACPI MADT, RSDP, SDT improvements - Context switch, percpu, event, IRQ scheme updates - MSI/vector allocation, NUMA/SLIT/SRAT support Local recipe source updates: - redox-driver-acpi: bus/prt hardening - redox-drm: Intel display, KMS connector improvements - driver-manager: config/scheme hardening - thermald: main.rs fix - uutils-tar, ninja-build: source updates Other: - bootloader, installer, redoxfs, relibc, userutils source updates - recipe.toml.backup, libxcvt source directory --- recipes/core/base/recipe.toml.backup | 260 +++++++ recipes/core/kernel/source/Cargo.toml | 1 + recipes/core/kernel/source/Makefile | 1 + recipes/core/kernel/source/build.rs | 13 + .../kernel/source/src/acpi/madt/arch/x86.rs | 70 +- .../core/kernel/source/src/acpi/madt/mod.rs | 100 +++ recipes/core/kernel/source/src/acpi/mod.rs | 26 + recipes/core/kernel/source/src/acpi/rsdp.rs | 34 +- recipes/core/kernel/source/src/acpi/sdt.rs | 16 + recipes/core/kernel/source/src/acpi/slit.rs | 45 ++ recipes/core/kernel/source/src/acpi/srat.rs | 102 +++ .../core/kernel/source/src/allocator/mod.rs | 40 +- .../kernel/source/src/arch/aarch64/start.rs | 2 +- .../kernel/source/src/arch/riscv64/start.rs | 2 +- .../src/arch/x86_shared/device/ioapic.rs | 133 +++- .../source/src/arch/x86_shared/device/mod.rs | 5 +- .../source/src/arch/x86_shared/device/msi.rs | 183 +++++ .../src/arch/x86_shared/device/vector.rs | 53 ++ .../kernel/source/src/arch/x86_shared/gdt.rs | 14 +- .../kernel/source/src/arch/x86_shared/idt.rs | 17 +- .../arch/x86_shared/interrupt/exception.rs | 50 +- 
.../kernel/source/src/arch/x86_shared/mod.rs | 2 + .../source/src/arch/x86_shared/sleep.rs | 712 ++++++++++++++++++ .../source/src/arch/x86_shared/start.rs | 39 +- .../source/src/asm/x86_64/s3_wakeup.asm | 110 +++ .../kernel/source/src/context/arch/aarch64.rs | 8 +- .../kernel/source/src/context/arch/riscv64.rs | 4 +- .../kernel/source/src/context/arch/x86.rs | 8 +- .../kernel/source/src/context/arch/x86_64.rs | 7 - .../core/kernel/source/src/context/context.rs | 4 + .../core/kernel/source/src/context/file.rs | 57 +- .../core/kernel/source/src/context/memory.rs | 30 +- recipes/core/kernel/source/src/context/mod.rs | 14 +- .../core/kernel/source/src/context/switch.rs | 119 ++- recipes/core/kernel/source/src/cpu_set.rs | 7 +- recipes/core/kernel/source/src/event.rs | 12 +- recipes/core/kernel/source/src/main.rs | 3 + recipes/core/kernel/source/src/numa.rs | 81 ++ recipes/core/kernel/source/src/percpu.rs | 191 ++++- recipes/core/kernel/source/src/scheme/acpi.rs | 68 +- .../core/kernel/source/src/scheme/debug.rs | 13 +- .../core/kernel/source/src/scheme/event.rs | 54 +- recipes/core/kernel/source/src/scheme/irq.rs | 96 ++- recipes/core/kernel/source/src/scheme/mod.rs | 85 ++- recipes/core/kernel/source/src/scheme/pipe.rs | 581 +++++++++----- recipes/core/kernel/source/src/scheme/proc.rs | 71 +- recipes/core/kernel/source/src/scheme/user.rs | 220 ++++-- .../core/kernel/source/src/startup/memory.rs | 19 +- recipes/core/kernel/source/src/startup/mod.rs | 13 +- recipes/core/kernel/source/src/sync/mcs.rs | 188 +++++ recipes/core/kernel/source/src/sync/mod.rs | 1 + .../core/kernel/source/src/sync/ordered.rs | 142 ++++ recipes/core/kernel/source/src/syscall/fs.rs | 282 ++++--- recipes/core/kernel/source/src/syscall/mod.rs | 9 + .../core/kernel/source/src/syscall/process.rs | 90 ++- recipes/wip/x11/libxcvt/source/.gitlab-ci.yml | 112 +++ .../x11/libxcvt/source/.gitlab-ci/ci.template | 118 +++ .../x11/libxcvt/source/.gitlab-ci/config.yml | 9 + 
.../libxcvt/source/.gitlab-ci/meson-build.sh | 48 ++ recipes/wip/x11/libxcvt/source/COPYING | 67 ++ recipes/wip/x11/libxcvt/source/README.md | 36 + recipes/wip/x11/libxcvt/source/cvt/cvt.c | 257 +++++++ .../wip/x11/libxcvt/source/cvt/meson.build | 10 + .../libxcvt/source/include/libxcvt/libxcvt.h | 46 ++ .../source/include/libxcvt/libxcvt_mode.h | 56 ++ .../source/include/libxcvt/meson.build | 1 + .../x11/libxcvt/source/include/meson.build | 1 + recipes/wip/x11/libxcvt/source/lib/libxcvt.c | 301 ++++++++ .../wip/x11/libxcvt/source/lib/meson.build | 7 + recipes/wip/x11/libxcvt/source/man/cvt.man | 41 + .../wip/x11/libxcvt/source/man/meson.build | 12 + recipes/wip/x11/libxcvt/source/meson.build | 28 + 72 files changed, 5058 insertions(+), 599 deletions(-) create mode 100644 recipes/core/base/recipe.toml.backup create mode 100644 recipes/core/kernel/source/src/acpi/slit.rs create mode 100644 recipes/core/kernel/source/src/acpi/srat.rs create mode 100644 recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs create mode 100644 recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs create mode 100644 recipes/core/kernel/source/src/arch/x86_shared/sleep.rs create mode 100644 recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm create mode 100644 recipes/core/kernel/source/src/numa.rs create mode 100644 recipes/core/kernel/source/src/sync/mcs.rs create mode 100644 recipes/wip/x11/libxcvt/source/.gitlab-ci.yml create mode 100644 recipes/wip/x11/libxcvt/source/.gitlab-ci/ci.template create mode 100644 recipes/wip/x11/libxcvt/source/.gitlab-ci/config.yml create mode 100755 recipes/wip/x11/libxcvt/source/.gitlab-ci/meson-build.sh create mode 100644 recipes/wip/x11/libxcvt/source/COPYING create mode 100644 recipes/wip/x11/libxcvt/source/README.md create mode 100644 recipes/wip/x11/libxcvt/source/cvt/cvt.c create mode 100644 recipes/wip/x11/libxcvt/source/cvt/meson.build create mode 100644 recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt.h create mode 
100644 recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt_mode.h create mode 100644 recipes/wip/x11/libxcvt/source/include/libxcvt/meson.build create mode 100644 recipes/wip/x11/libxcvt/source/include/meson.build create mode 100644 recipes/wip/x11/libxcvt/source/lib/libxcvt.c create mode 100644 recipes/wip/x11/libxcvt/source/lib/meson.build create mode 100644 recipes/wip/x11/libxcvt/source/man/cvt.man create mode 100644 recipes/wip/x11/libxcvt/source/man/meson.build create mode 100644 recipes/wip/x11/libxcvt/source/meson.build diff --git a/recipes/core/base/recipe.toml.backup b/recipes/core/base/recipe.toml.backup new file mode 100644 index 0000000000..59646cdb28 --- /dev/null +++ b/recipes/core/base/recipe.toml.backup @@ -0,0 +1,260 @@ +[source] +git = "https://gitlab.redox-os.org/redox-os/base.git" +rev = "463f76b9608a896e6f6c9f63457f57f6409873c7" +patches = [ + "P0-daemon-fix-init-notify-unwrap.patch", + "P0-workspace-add-bootstrap.patch", + "P0-init-continuous-scheduling.patch", + "P0-dhcpd-auto-iface.patch", + "P0-procmgr-sigchld-debug.patch", + "P0-pcid-mcfg-diagnostics.patch", + "P0-ihdgd-intel-gpu-ids.patch", + "P0-acpid-dmar-fix.patch", + # P1: acpid EC runtime and AML physmem hardening (narrow ACPI runtime patches) + "P1-acpid-ec-runtime.patch", + "P1-acpid-runtime-hardening.patch", + # Stale patches needing recreation: P1-pcid-uevent-surface, P2-boot-runtime-fixes, + # P2-hwd-misc, P2-pcid-cfg-access, P3-xhci-device-hardening, P6-cpufreqd-real-impl + "P2-i2c-gpio-ucsi-drivers.patch", + "P0-i2c-control-response-empty.patch", + "P2-ihdad-graceful-init.patch", + "P2-boot-logging.patch", + "P2-init-acpid-wiring.patch", + "P2-hwd-remove-acpid-spawn.patch", + "P2-initfs-pcid-service.patch", + "P2-misc-daemon-fixes.patch", + "P9-fix-so-pecred.patch", + "P3-inputd-keymap-bridge.patch", + # P3: ps2d consolidated — LED feedback, mouse resend, fastfail, Intellimouse2, controller init robustness, non-x86 fallback + 
"P7-ps2d-intellimouse2-leds-controller-init.patch", + "P3-usbhidd-hardening.patch", + "P3-init-colored-output.patch", + "P4-logd-persistent-logging.patch", + "P4-acpi-shutdown-hardening.patch", + "P4-acpi-s3-sleep.patch", + "P4-pcid-public-client-channel.patch", + "P4-pcid-config-scheme.patch", + "P4-pcid-spawner-pci-coordinate-env.patch", + "P4-initfs-usb-drm-services.patch", + "P4-initfs-release-virtio-gpu.patch", + "P4-initfs-network-services.patch", + "P4-initfs-getty-services.patch", + "P4-initfs-dbus-services.patch", + "P4-fbcond-scrollback.patch", + # P4: ucsid graceful ESTALE/ENOENT handling — don't crash when /scheme/acpi/symbols unavailable + "P4-ucsid-estale-graceful.patch", + # P4: Extend ESTALE/ENOENT graceful handling to all ACPI-reading daemons + "P4-acpi-estale-graceful.patch", + # P4: hwd graceful ESTALE/ENOENT handling in probe() + "P4-hwd-estale-graceful.patch", + # P5: i2c-hidd + intel-thc-hidd: boot-time ESTALE/ENOENT retry with exponential backoff + "P5-i2c-hidd-estale-retry.patch", + # P5: acpid /scheme/acpi/dmi SMBIOS endpoint for quirk matching + "P5-acpid-dmi-endpoint.patch", + "P4-thermal-daemon.patch", + "P4-thermald-workspace.patch", + "P6-driver-main-fixes.patch", + "P6-driver-new-modules.patch", + "P9-init-scheduler-completed.patch", + "P2-pcid-acpid-graceful-fd.patch", + # P5: Graceful DRM ioctl error handling in fbbootlogd/fbcond (avoid ENOTTY crash) + "P5-fbbootlogd-fbcond-graceful-drm.patch", + # P6: Fix rtcd EEXIST by avoiding O_CREAT on kernel scheme resource + "P6-rtcd-no-ocreat.patch", + # P6: Init hard requires dependency — blocks startup if dependency missing + "P6-init-requires-hard-dep.patch", + # P6: Fix pcid→acpid FD transfer — pass FD in metadata array, not payload + "P6-pcid-acpid-fd-transfer.patch", + # P7: Fix acpid pci_fd startup race — shared RwLock between scheme and AML handler + "P7-acpid-shared-pcifd.patch", + # P15: Init service timeout — prevent boot hanging on unresponsive daemons (30s default) + 
"P15-7-init-service-timeout.patch", + # P15: Dependency cycle detection in unit loader — log and skip circular requires_weak + "P15-8-init-cycle-detection.patch", + # P18: Init daemon restart policy — supervise Notify/Scheme services with exponential backoff + "P18-1-daemon-restart.patch", + # P18: ACPID robustness — RSDP BIOS-area fallback, graceful physmem error handling + "P18-5-acpid-robustness.patch", + # P18: MSI/MSI-X enablement — skip legacy IRQ for MSI-capable devices + "P18-3-msi-msix-enablement.patch", + # P18: Bounded IPC queues — backlog limits for chan, UDS stream, UDS dgram + "P18-8-bounded-ipcd-queues.patch", + # P18: MSI/MSI-X allocation resilience — handle EEXIST, fallback chain MSI-X→MSI→legacy + "P18-9-msi-allocation-resilience.patch", +] + +[package] +installs = [ + "/lib/pcid.d/ac97d.toml", + "/lib/pcid.d/e1000d.toml", + "/lib/pcid.d/ihdad.toml", + "/lib/pcid.d/ihdgd.toml", + "/lib/pcid.d/ixgbed.toml", + "/lib/pcid.d/rtl8139d.toml", + "/lib/pcid.d/rtl8168d.toml", + "/lib/pcid.d/vboxd.toml", + "/lib/pcid.d/virtio-netd.toml", + "/lib/pcid.d/xhcid.toml", + "/usr/bin/audiod", + "/usr/bin/dhcpd", + "/usr/bin/dw-acpi-i2cd", + "/usr/bin/gpiod", + "/usr/bin/i2cd", + "/usr/bin/i2c-gpio-expanderd", + "/usr/bin/i2c-hidd", + "/usr/bin/inputd", + "/usr/bin/intel-gpiod", + "/usr/bin/ipcd", + "/usr/bin/netstack", + "/usr/bin/pcid", + "/usr/bin/pcid-spawner", + "/usr/bin/ptyd", + "/usr/bin/redoxerd", + "/usr/bin/smolnetd", + "/usr/bin/ucsid", + "/usr/lib/drivers/ac97d", + "/usr/lib/drivers/ahcid", + "/usr/lib/drivers/amd-mp2-i2cd", + "/usr/lib/drivers/e1000d", + "/usr/lib/drivers/ihdad", + "/usr/lib/drivers/ihdgd", + "/usr/lib/drivers/ided", + "/usr/lib/drivers/intel-lpss-i2cd", + "/usr/lib/drivers/intel-thc-hidd", + "/usr/lib/drivers/ixgbed", + "/usr/lib/drivers/ps2d", + "/usr/lib/drivers/rtl8139d", + "/usr/lib/drivers/rtl8168d", + "/usr/lib/drivers/sb16d", + "/usr/lib/drivers/thermald", + "/usr/lib/drivers/usbctl", + "/usr/lib/drivers/usbhidd", + 
"/usr/lib/drivers/usbhubd", + "/usr/lib/drivers/usbscsid", + "/usr/lib/drivers/vboxd", + "/usr/lib/drivers/virtio-gpud", + "/usr/lib/drivers/virtio-netd", + "/usr/lib/drivers/xhcid", + "/usr/lib/init.d/00_base.target", + "/usr/lib/init.d/00_ipcd.service", + "/usr/lib/init.d/00_pcid-spawner.service", + "/usr/lib/init.d/00_ptyd.service", + "/usr/lib/init.d/00_sudo.service", + "/usr/lib/init.d/00_tmp", + "/usr/lib/init.d/05_boot_essential.target", + "/usr/lib/init.d/10_dhcpd.service", + "/usr/lib/init.d/10_net.target", + "/usr/lib/init.d/10_smolnetd.service", + "/usr/lib/init.d/12_boot_late.target", + "/usr/lib/init.d/12_dbus.service", + "/usr/lib/init.d/13_seatd.service", + "/usr/lib/init.d/13_sessiond.service", + "/usr/lib/init.d/20_audiod.service", + "/usr/lib/init.d/29_activate_console.service", + "/usr/lib/init.d/30_console.service", + "/usr/lib/init.d/30_thermald.service", + "/usr/lib/init.d/31_debug_console.service", +] + +[build] +template = "custom" +script = """ +mkdir -pv "${COOKBOOK_STAGE}/usr/bin" +for package in audiod ipcd ptyd dhcpd; do + "${COOKBOOK_CARGO}" build \ + --manifest-path "${COOKBOOK_SOURCE}/${package}/Cargo.toml" \ + --target "${TARGET}" \ + ${build_flags} + cp -v \ + "target/${TARGET}/${build_type}/${package}" \ + "${COOKBOOK_STAGE}/usr/bin/${package}" +done + +"${COOKBOOK_CARGO}" build \ + --manifest-path "${COOKBOOK_SOURCE}/netstack/Cargo.toml" \ + --target "${TARGET}" \ + ${build_flags} +cp -v \ + "target/${TARGET}/${build_type}/netstack" \ + "${COOKBOOK_STAGE}/usr/bin/netstack" +cp -v \ + "target/${TARGET}/${build_type}/netstack" \ + "${COOKBOOK_STAGE}/usr/bin/smolnetd" + +# Drivers that are built on all architectures, and NOT in drivers-initfs +BINS=( + gpiod + i2c-gpio-expanderd + intel-gpiod + amd-mp2-i2cd + dw-acpi-i2cd + e1000d + ihdad + ihdgd + i2c-hidd + intel-thc-hidd + intel-lpss-i2cd + ixgbed + pcid + pcid-spawner + rtl8139d + rtl8168d + usbctl + usbhidd + thermald + usbhubd + ucsid + usbscsid + virtio-gpud + virtio-netd + 
xhcid + i2cd + inputd + redoxerd +) + +# Add additional drivers to the list to build, that are not in drivers-initfs +# depending on the target architecture +case "${TARGET}" in + i586-unknown-redox | i686-unknown-redox | x86_64-unknown-redox) + BINS+=(ac97d ahcid ided nvmed ps2d sb16d vboxd) + ;; + *) + ;; +esac + +#Build each driver in the list +mkdir -pv "${COOKBOOK_STAGE}/usr/bin" "${COOKBOOK_STAGE}/usr/lib/drivers" +export CARGO_PROFILE_RELEASE_OPT_LEVEL=s +export CARGO_PROFILE_RELEASE_PANIC=abort +# Only build drivers that actually have source Cargo.toml entries +EXISTING_BINS=() +for bin in "${BINS[@]}" +do + if grep -Rqs "^name = \\\"${bin}\\\"$" "${COOKBOOK_SOURCE}"; then + EXISTING_BINS+=("${bin}") + fi +done +"${COOKBOOK_CARGO}" build ${build_flags} \ + --manifest-path "${COOKBOOK_SOURCE}/Cargo.toml" \ + --target "${TARGET}" \ + $(for bin in "${EXISTING_BINS[@]}"; do echo "-p" "${bin}"; done) +for bin in "${EXISTING_BINS[@]}" +do + if [[ "${bin}" == "gpiod" || "${bin}" == "i2c-gpio-expanderd" || "${bin}" == "intel-gpiod" || "${bin}" == "i2cd" || "${bin}" == "dw-acpi-i2cd" || "${bin}" == "i2c-hidd" || "${bin}" == "inputd" || "${bin}" == "pcid" || "${bin}" == "pcid-spawner" || "${bin}" == "redoxerd" || "${bin}" == "ucsid" ]]; then + cp -v "target/${TARGET}/${build_type}/${bin}" "${COOKBOOK_STAGE}/usr/bin" + else + cp -v "target/${TARGET}/${build_type}/${bin}" "${COOKBOOK_STAGE}/usr/lib/drivers" + fi +done + +mkdir -pv "${COOKBOOK_STAGE}/lib/pcid.d" +find "${COOKBOOK_SOURCE}/drivers" -maxdepth 3 -type f -name 'config.toml' | while read conf +do + driver="$(basename "$(dirname "$conf")")" + cp -v "$conf" "${COOKBOOK_STAGE}/lib/pcid.d/$driver.toml" +done + +mkdir -pv "${COOKBOOK_STAGE}/usr/lib/init.d" +cp -v "${COOKBOOK_SOURCE}/init.d"/* "${COOKBOOK_STAGE}/usr/lib/init.d/" +""" diff --git a/recipes/core/kernel/source/Cargo.toml b/recipes/core/kernel/source/Cargo.toml index 6d4f059ace..e05f723c88 100644 --- a/recipes/core/kernel/source/Cargo.toml +++ 
b/recipes/core/kernel/source/Cargo.toml @@ -12,6 +12,7 @@ cc = "1.0" toml = "0.8" [dependencies] +acpi_ext = { package = "acpi", git = "https://gitlab.redox-os.org/redox-os/acpi.git", branch = "redox-6.x" } arrayvec = { version = "0.7.4", default-features = false } bitfield = "0.13.2" bitflags = "2" diff --git a/recipes/core/kernel/source/Makefile b/recipes/core/kernel/source/Makefile index 68a8c50ae5..ce59b910b5 100644 --- a/recipes/core/kernel/source/Makefile +++ b/recipes/core/kernel/source/Makefile @@ -1,3 +1,4 @@ +# Red Bear OS kernel patches applied via individual patch files .PHONY: all check SOURCE:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) diff --git a/recipes/core/kernel/source/build.rs b/recipes/core/kernel/source/build.rs index 96c3ea5c78..751746ccdd 100644 --- a/recipes/core/kernel/source/build.rs +++ b/recipes/core/kernel/source/build.rs @@ -77,6 +77,7 @@ fn main() { } "x86_64" => { println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm"); + println!("cargo::rerun-if-changed=src/asm/x86_64/s3_wakeup.asm"); let status = Command::new("nasm") .arg("-f") @@ -89,6 +90,18 @@ fn main() { if !status.success() { panic!("nasm failed with exit status {}", status); } + + let status = Command::new("nasm") + .arg("-f") + .arg("bin") + .arg("-o") + .arg(format!("{}/s3_wakeup", out_dir)) + .arg("src/asm/x86_64/s3_wakeup.asm") + .status() + .expect("failed to run nasm"); + if !status.success() { + panic!("nasm failed with exit status {}", status); + } } "riscv64" => { println!("cargo::rustc-cfg=dtb"); diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs index a4d5a98b23..306ec15442 100644 --- a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs +++ b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs @@ -189,8 +189,18 @@ pub(super) fn init(madt: Madt) { let preliminary_cpu_count = madt .iter() .filter(|entry| match entry { - MadtEntry::LocalApic(local) => u32::from(local.id) == 
me.get() || local.flags & 1 == 1, - MadtEntry::LocalX2Apic(local) => local.x2apic_id == me.get() || local.flags & 1 == 1, + // When x2APIC is active, LocalApic entries use 8-bit IDs that don't + // match the BSP's 32-bit x2APIC ID. Use LocalX2Apic entries instead. + MadtEntry::LocalApic(local) if !local_apic.x2 => { + u32::from(local.id) == me.get() || local.flags & 1 == 1 + } + MadtEntry::LocalApic(_) => false, + // xAPIC mode: cannot use 32-bit x2APIC IDs via 8-bit ICR. + // Skip LocalX2Apic entries and use LocalApic exclusively. + MadtEntry::LocalX2Apic(local) if local_apic.x2 => { + local.x2apic_id == me.get() || local.flags & 1 == 1 + } + MadtEntry::LocalX2Apic(_) => false, _ => false, }) .count(); @@ -205,18 +215,28 @@ pub(super) fn init(madt: Madt) { let _ = seen_apic_ids.insert(me.get()); // BSP for entry in madt.iter() { match entry { - MadtEntry::LocalApic(local) if local.flags & 1 == 1 => { + MadtEntry::LocalApic(local) if local.flags & 1 == 1 && !local_apic.x2 => { let id = u32::from(local.id); if !seen_apic_ids.insert(id) { warn!("MADT: duplicate APIC ID {} in LocalApic entry, firmware bug", id); } } - MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 => { + MadtEntry::LocalApic(local) if local.flags & 1 == 1 && local_apic.x2 => { + // x2APIC mode: skip 8-bit LocalApic IDs; they conflict with + // 32-bit x2APIC IDs. Dedup only among LocalX2Apic entries. + debug!("MADT: ignoring 8-bit LocalApic ID {} in x2APIC mode", local.id); + } + MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 && local_apic.x2 => { let id = local.x2apic_id; if !seen_apic_ids.insert(id) { warn!("MADT: duplicate x2APIC ID {} in LocalX2Apic entry, firmware bug", id); } } + MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 && !local_apic.x2 => { + // xAPIC mode: skip 32-bit x2APIC IDs; dedup only among LocalApic entries. 
+ let id = local.x2apic_id; // Copy from packed struct + debug!("MADT: ignoring 32-bit x2APIC ID {} in xAPIC mode", id); + } _ => {} } } @@ -225,7 +245,16 @@ pub(super) fn init(madt: Madt) { for madt_entry in madt.iter() { debug!(" {:x?}", madt_entry); if let MadtEntry::LocalApic(ap_local_apic) = madt_entry { - if u32::from(ap_local_apic.id) == me.get() { + // x2APIC mode: LocalApic entries have 8-bit IDs that don't match + // the BSP's 32-bit x2APIC ID. All entries would be treated as APs, + // and SIPI would target the wrong processors. Skip them and rely + // on LocalX2Apic entries exclusively. + if local_apic.x2 { + debug!( + " Skipping 8-bit LocalApic id={} (x2APIC active, using LocalX2Apic entries)", + ap_local_apic.id + ); + } else if u32::from(ap_local_apic.id) == me.get() { debug!(" This is my local APIC"); } else if ap_local_apic.flags & 1 == 1 { // Allocate a stack @@ -383,14 +412,19 @@ pub(super) fn init(madt: Madt) { } RmmA::invalidate_all(); - } else { - debug!("KERNEL AP: LAPIC CPU {} disabled in MADT, skipping", u32::from(ap_local_apic.id)); } } else if let MadtEntry::LocalX2Apic(ap_x2apic) = madt_entry { let apic_id = ap_x2apic.x2apic_id; let flags = ap_x2apic.flags; - if apic_id == me.get() { + // xAPIC mode: cannot target 32-bit x2APIC IDs via 8-bit ICR. + // Skip LocalX2Apic entries; use LocalApic entries exclusively. 
+ if !local_apic.x2 { + debug!( + " Skipping 32-bit x2APIC id={} (xAPIC mode, using LocalApic entries)", + apic_id + ); + } else if apic_id == me.get() { debug!(" This is my local x2APIC"); } else if flags & 1 == 1 { let alloc = match allocate_p2frame(4) { @@ -446,11 +480,7 @@ pub(super) fn init(madt: Madt) { // Send INIT IPI (Assert) { let mut icr = 0x4500u64; - if local_apic.x2 { - icr |= u64::from(apic_id) << 32; - } else { - icr |= u64::from(apic_id as u8) << 56; - } + icr |= u64::from(apic_id) << 32; local_apic.set_icr(icr); } @@ -461,11 +491,7 @@ pub(super) fn init(madt: Madt) { { let ap_segment = (TRAMPOLINE >> 12) & 0xFF; let mut icr = 0x0600u64 | ap_segment as u64; - if local_apic.x2 { - icr |= u64::from(apic_id) << 32; - } else { - icr |= u64::from(apic_id as u8) << 56; - } + icr |= u64::from(apic_id) << 32; local_apic.set_icr(icr); } @@ -476,11 +502,7 @@ pub(super) fn init(madt: Madt) { { let ap_segment = (TRAMPOLINE >> 12) & 0xFF; let mut icr = 0x0600u64 | ap_segment as u64; - if local_apic.x2 { - icr |= u64::from(apic_id) << 32; - } else { - icr |= u64::from(apic_id as u8) << 56; - } + icr |= u64::from(apic_id) << 32; local_apic.set_icr(icr); } @@ -534,8 +556,6 @@ pub(super) fn init(madt: Madt) { } RmmA::invalidate_all(); - } else { - debug!("KERNEL AP: x2APIC CPU {} disabled in MADT (flags={:#x}), skipping", apic_id, flags); } } else if let MadtEntry::LocalApicNmi(nmi) = madt_entry { let target_apic = nmi.processor; diff --git a/recipes/core/kernel/source/src/acpi/madt/mod.rs b/recipes/core/kernel/source/src/acpi/madt/mod.rs index 3159b9c497..ed68d6eea8 100644 --- a/recipes/core/kernel/source/src/acpi/madt/mod.rs +++ b/recipes/core/kernel/source/src/acpi/madt/mod.rs @@ -34,6 +34,12 @@ impl Madt { let madt = Madt::new(find_one_sdt!("APIC")); if let Some(madt) = madt { + // Validate MADT checksum per ACPI 6.5 §5.2.2 + if !madt.sdt.validate_checksum() { + error!("MADT checksum validation failed, skipping APIC initialization"); + return; + } + // safe 
because no APs have been started yet. unsafe { MADT.get().write(Some(madt)) }; @@ -146,6 +152,48 @@ pub struct MadtGicd { _reserved2: [u8; 3], } +/// MADT Local x2APIC (entry type 0x9) +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtLocalX2Apic { + _reserved: u16, + pub x2apic_id: u32, + pub flags: u32, + pub processor_uid: u32, +} + +/// MADT Local APIC NMI (entry type 0x4) +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtLocalApicNmi { + pub processor: u8, + pub flags: u16, + pub nmi_pin: u8, +} + +/// MADT Local APIC address override (entry type 0x5) +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtLapicAddressOverride { + _reserved: u16, + pub local_apic_address: u64, +} + +/// MADT Local x2APIC NMI (entry type 0xA) +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtLocalX2ApicNmi { + _reserved: u16, + pub processor_uid: u32, + pub flags: u16, + pub nmi_pin: u8, + _reserved2: u8, +} + +const _: () = assert!(size_of::() == 4); +const _: () = assert!(size_of::() == 10); +const _: () = assert!(size_of::() == 10); + /// MADT Entries #[derive(Debug)] #[allow(dead_code)] @@ -156,10 +204,18 @@ pub enum MadtEntry { InvalidIoApic(usize), IntSrcOverride(&'static MadtIntSrcOverride), InvalidIntSrcOverride(usize), + LocalApicNmi(&'static MadtLocalApicNmi), + InvalidLocalApicNmi(usize), + LapicAddressOverride(&'static MadtLapicAddressOverride), + InvalidLapicAddressOverride(usize), Gicc(&'static MadtGicc), InvalidGicc(usize), Gicd(&'static MadtGicd), InvalidGicd(usize), + LocalX2Apic(&'static MadtLocalX2Apic), + InvalidLocalX2Apic(usize), + LocalX2ApicNmi(&'static MadtLocalX2ApicNmi), + InvalidLocalX2ApicNmi(usize), Unknown(u8), } @@ -176,6 +232,10 @@ impl Iterator for MadtIter { let entry_len = unsafe { *(self.sdt.data_address() as *const u8).add(self.i + 1) } as usize; + if entry_len < 2 { + return None; + } + if self.i + entry_len <= self.sdt.data_len() { let item = match entry_type { 0x0 => { @@ 
-206,6 +266,46 @@ impl Iterator for MadtIter { MadtEntry::InvalidIntSrcOverride(entry_len) } } + 0x4 => { + if entry_len == size_of::() + 2 { + MadtEntry::LocalApicNmi(unsafe { + &*((self.sdt.data_address() + self.i + 2) + as *const MadtLocalApicNmi) + }) + } else { + MadtEntry::InvalidLocalApicNmi(entry_len) + } + } + 0x5 => { + if entry_len == size_of::() + 2 { + MadtEntry::LapicAddressOverride(unsafe { + &*((self.sdt.data_address() + self.i + 2) + as *const MadtLapicAddressOverride) + }) + } else { + MadtEntry::InvalidLapicAddressOverride(entry_len) + } + } + 0x9 => { + if entry_len == size_of::() + 2 { + MadtEntry::LocalX2Apic(unsafe { + &*((self.sdt.data_address() + self.i + 2) + as *const MadtLocalX2Apic) + }) + } else { + MadtEntry::InvalidLocalX2Apic(entry_len) + } + } + 0xA => { + if entry_len == size_of::() + 2 { + MadtEntry::LocalX2ApicNmi(unsafe { + &*((self.sdt.data_address() + self.i + 2) + as *const MadtLocalX2ApicNmi) + }) + } else { + MadtEntry::InvalidLocalX2ApicNmi(entry_len) + } + } 0xB => { if entry_len >= size_of::() + 2 { MadtEntry::Gicc(unsafe { diff --git a/recipes/core/kernel/source/src/acpi/mod.rs b/recipes/core/kernel/source/src/acpi/mod.rs index 59e3526544..d6a744ef90 100644 --- a/recipes/core/kernel/source/src/acpi/mod.rs +++ b/recipes/core/kernel/source/src/acpi/mod.rs @@ -20,6 +20,8 @@ mod rxsdt; pub mod sdt; #[cfg(target_arch = "aarch64")] mod spcr; +pub mod slit; +pub mod srat; mod xsdt; unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) { @@ -82,6 +84,14 @@ impl Rxsdt for RxsdtEnum { pub static RXSDT_ENUM: Once = Once::new(); +#[derive(Clone, Copy, Debug)] +pub struct AcpiRootInfo { + pub revision: u8, + pub root_sdt_address: PhysicalAddress, +} + +pub static ACPI_ROOT_INFO: Once = Once::new(); + /// Parse the ACPI tables to gather CPU, interrupt, and timer information pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) { unsafe { @@ -94,6 +104,15 @@ pub unsafe fn 
init(already_supplied_rsdp: Option<*const u8>) { let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp); if let Some(rsdp) = rsdp_opt { + let root_info = ACPI_ROOT_INFO.call_once(|| AcpiRootInfo { + revision: rsdp.revision(), + root_sdt_address: rsdp.sdt_address(), + }); + + if root_info.root_sdt_address != rsdp.sdt_address() || root_info.revision != rsdp.revision() { + error!("ACPI_ROOT_INFO already initialized with a different RSDP root"); + } + debug!("SDT address: {:#x}", rsdp.sdt_address().data()); let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw()); @@ -146,7 +165,14 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) { // TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface // to initialize enumerated processors to userspace? + // Parse SRAT BEFORE MADT so NUMA node mapping is available + // when APs are started and PercpuBlocks are created. + srat::init(); + Madt::init(); + + // Parse SLIT after MADT for the NUMA distance matrix. 
+ slit::init(); //TODO: support this on any arch // SPCR must be initialized after MADT for interrupt controllers #[cfg(target_arch = "aarch64")] diff --git a/recipes/core/kernel/source/src/acpi/rsdp.rs b/recipes/core/kernel/source/src/acpi/rsdp.rs index f10c5ac989..94e8603743 100644 --- a/recipes/core/kernel/source/src/acpi/rsdp.rs +++ b/recipes/core/kernel/source/src/acpi/rsdp.rs @@ -17,9 +17,33 @@ pub struct Rsdp { impl Rsdp { pub unsafe fn get_rsdp(already_supplied_rsdp: Option<*const u8>) -> Option { - already_supplied_rsdp.map(|rsdp_ptr| { - // TODO: Validate - unsafe { *(rsdp_ptr as *const Rsdp) } + already_supplied_rsdp.and_then(|rsdp_ptr| { + let rsdp = unsafe { *(rsdp_ptr as *const Rsdp) }; + + // Validate signature "RSD PTR " + if &rsdp.signature != b"RSD PTR " { + return None; + } + + // ACPI 1.0 checksum: sum of first 20 bytes must be zero + let bytes_v1 = unsafe { core::slice::from_raw_parts(rsdp_ptr, 20) }; + if bytes_v1.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 { + return None; + } + + // ACPI 2.0+ extended checksum: sum of entire table (length bytes) must be zero + if rsdp.revision >= 2 { + let full_len = rsdp._length as usize; + if full_len < 36 || full_len > 256 { + return None; + } + let bytes_full = unsafe { core::slice::from_raw_parts(rsdp_ptr, full_len) }; + if bytes_full.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 { + return None; + } + } + + Some(rsdp) }) } @@ -31,4 +55,8 @@ impl Rsdp { self.rsdt_address as usize }) } + + pub fn revision(&self) -> u8 { + self.revision + } } diff --git a/recipes/core/kernel/source/src/acpi/sdt.rs b/recipes/core/kernel/source/src/acpi/sdt.rs index 83ff67dac1..2f1f54cd9b 100644 --- a/recipes/core/kernel/source/src/acpi/sdt.rs +++ b/recipes/core/kernel/source/src/acpi/sdt.rs @@ -24,4 +24,20 @@ impl Sdt { let header_size = size_of::(); total_size.saturating_sub(header_size) } + + /// Validate the SDT checksum. 
+    ///
+    /// Per ACPI 6.5 §5.2.2: the entire table (including the checksum field)
+    /// must sum to 0 when all bytes are added together as unsigned 8-bit values.
+    pub fn validate_checksum(&self) -> bool {
+        let ptr = self as *const _ as *const u8;
+        let len = self.length as usize;
+        if len < size_of::<Self>() {
+            return false;
+        }
+        let sum = unsafe { core::slice::from_raw_parts(ptr, len) }
+            .iter()
+            .fold(0u8, |acc, &b| acc.wrapping_add(b));
+        sum == 0
+    }
 }
diff --git a/recipes/core/kernel/source/src/acpi/slit.rs b/recipes/core/kernel/source/src/acpi/slit.rs
new file mode 100644
index 0000000000..605f303390
--- /dev/null
+++ b/recipes/core/kernel/source/src/acpi/slit.rs
@@ -0,0 +1,98 @@
+//! SLIT (System Locality Information Table) parser.
+//!
+//! Parses the NUMA distance matrix for scheduler NUMA-aware work stealing.
+
+use super::sdt::Sdt;
+use crate::acpi::find_sdt;
+
+/// Largest locality count whose matrix is stored; bigger tables are ignored by `init()`.
+const MAX_NODES: usize = 8;
+
+// Written only by `init()`. Every cell starts at 10, the same value `distance()`
+// falls back to when no matrix was loaded.
+static mut SLIT_MATRIX: [[u8; MAX_NODES]; MAX_NODES] = [[10u8; MAX_NODES]; MAX_NODES];
+static mut SLIT_NUM_NODES: usize = 0;
+static mut SLIT_AVAILABLE: bool = false;
+
+/// Returns `true` once `init()` has loaded a distance matrix.
+pub fn is_available() -> bool {
+    unsafe { SLIT_AVAILABLE }
+}
+
+/// Returns the locality count recorded by `init()` (0 until a matrix is loaded).
+pub fn num_nodes() -> usize {
+    unsafe { SLIT_NUM_NODES }
+}
+
+/// Returns the stored distance from node `from` to node `to`.
+///
+/// Yields 10 when no matrix was loaded or when either index is `>= MAX_NODES`.
+pub fn distance(from: u8, to: u8) -> u8 {
+    if !unsafe { SLIT_AVAILABLE } {
+        return 10;
+    }
+    let (from, to) = (from as usize, to as usize);
+    if from >= MAX_NODES || to >= MAX_NODES {
+        return 10;
+    }
+    unsafe { SLIT_MATRIX[from][to] }
+}
+
+/// Returns `true` when the distance between the two nodes is at most 20.
+pub fn same_socket(node1: u8, node2: u8) -> bool {
+    distance(node1, node2) <= 20
+}
+
+/// Finds the SLIT and copies its distance matrix into `SLIT_MATRIX`.
+///
+/// Publishes nothing when the table is missing, duplicated, mis-signed,
+/// truncated, or describes zero or more than `MAX_NODES` localities.
+pub fn init() {
+    let sdt = match find_sdt("SLIT").as_slice() {
+        [] => return,
+        [x] => *x,
+        xs => {
+            println!("SLIT: {} tables found, expected 1", xs.len());
+            return;
+        }
+    };
+    if &sdt.signature != b"SLIT" {
+        return;
+    }
+    let data_addr = sdt.data_address();
+    let data_len = sdt.data_len();
+    if data_len < 8 {
+        return;
+    }
+    // Nothing visible here guarantees that `data_addr` is 8-byte aligned, and
+    // dereferencing a misaligned `*const u64` is undefined behaviour, so the
+    // 64-bit locality count is fetched with an unaligned read.
+    let num_nodes = unsafe { core::ptr::read_unaligned(data_addr as *const u64) };
+    // Range-check the full 64-bit value before narrowing it: on a target with a
+    // 32-bit `usize` an oversized count could otherwise wrap back into range.
+    if num_nodes == 0 || num_nodes > MAX_NODES as u64 {
+        println!("SLIT: {num_nodes} nodes (max {MAX_NODES}), ignoring");
+        return;
+    }
+    let num_nodes = num_nodes as usize;
+    let matrix_start = 8;
+    let matrix_size = num_nodes * num_nodes;
+    if data_len < matrix_start + matrix_size {
+        println!("SLIT: matrix truncated ({data_len} < {})", matrix_start + matrix_size);
+        return;
+    }
+    // Reach the `static mut` through a raw pointer instead of borrowing it directly.
+    let matrix = unsafe { &mut *core::ptr::addr_of_mut!(SLIT_MATRIX) };
+    for i in 0..num_nodes {
+        for j in 0..num_nodes {
+            // Entries are single bytes stored row by row after the count.
+            matrix[i][j] =
+                unsafe { *((data_addr + matrix_start + i * num_nodes + j) as *const u8) };
+        }
+    }
+    unsafe {
+        SLIT_NUM_NODES = num_nodes;
+        SLIT_AVAILABLE = true;
+    }
+    debug!("SLIT: {} nodes, distance matrix loaded", num_nodes);
+}
diff --git a/recipes/core/kernel/source/src/acpi/srat.rs b/recipes/core/kernel/source/src/acpi/srat.rs
new file mode 100644
index 0000000000..49b3ac0ac7
--- /dev/null
+++ b/recipes/core/kernel/source/src/acpi/srat.rs
@@ -0,0 +1,211 @@
+//! SRAT (System Resource Affinity Table) parser.
+//!
+//! Parses CPU-to-NUMA-node and memory-to-NUMA-node affinity information.
+//! Called before MADT init so that NUMA data is available during AP startup.
+
+use super::sdt::Sdt;
+use crate::acpi::find_sdt;
+
+const MAX_CPU_ENTRIES: usize = 256;
+const MAX_MEM_ENTRIES: usize = 64;
+
+/// Processor affinity record kept from a Local APIC or Local x2APIC entry.
+#[derive(Clone, Copy)]
+struct SratCpuEntry {
+    apic_id: u32,
+    node: u8,
+    enabled: bool,
+}
+
+/// Memory affinity record kept from a Memory Affinity entry.
+#[derive(Clone, Copy)]
+struct SratMemEntry {
+    node: u8,
+    base: u64,
+    length: u64,
+    enabled: bool,
+}
+
+const CPU_NONE: SratCpuEntry = SratCpuEntry { apic_id: u32::MAX, node: 0, enabled: false };
+const MEM_NONE: SratMemEntry = SratMemEntry { node: 0, base: 0, length: 0, enabled: false };
+
+// Written only by `init()`; the first `SRAT_*_COUNT` slots of each table are filled.
+static mut SRAT_CPU_ENTRIES: [SratCpuEntry; MAX_CPU_ENTRIES] = [CPU_NONE; MAX_CPU_ENTRIES];
+static mut SRAT_MEM_ENTRIES: [SratMemEntry; MAX_MEM_ENTRIES] = [MEM_NONE; MAX_MEM_ENTRIES];
+static mut SRAT_CPU_COUNT: usize = 0;
+static mut SRAT_MEM_COUNT: usize = 0;
+static mut SRAT_AVAILABLE: bool = false;
+
+/// Returns `true` once `init()` has parsed an SRAT.
+pub fn is_available() -> bool {
+    unsafe { SRAT_AVAILABLE }
+}
+
+/// Returns the node of the first enabled CPU entry with this APIC ID, if any.
+pub fn numa_node_for_apic(apic_id: u32) -> Option<u8> {
+    if !unsafe { SRAT_AVAILABLE } {
+        return None;
+    }
+    let count = unsafe { SRAT_CPU_COUNT };
+    let entries = unsafe { &*core::ptr::addr_of!(SRAT_CPU_ENTRIES) };
+    entries[..count]
+        .iter()
+        .find(|entry| entry.apic_id == apic_id && entry.enabled)
+        .map(|entry| entry.node)
+}
+
+/// Returns one more than the highest node number among enabled CPU entries.
+///
+/// Yields 1 when no SRAT was parsed or no CPU entry is enabled.
+pub fn numa_node_count() -> usize {
+    if !unsafe { SRAT_AVAILABLE } {
+        return 1;
+    }
+    let count = unsafe { SRAT_CPU_COUNT };
+    let entries = unsafe { &*core::ptr::addr_of!(SRAT_CPU_ENTRIES) };
+    let max_node = entries[..count]
+        .iter()
+        .filter(|entry| entry.enabled)
+        .map(|entry| entry.node)
+        .max()
+        .unwrap_or(0);
+    (max_node as usize) + 1
+}
+
+/// Processor Local APIC affinity entry (type 0), minus its 2-byte type/length prefix.
+#[repr(C, packed)]
+struct SratLocalApic {
+    _proximity_lo: u8,
+    apic_id: u8,
+    flags: u32,
+    _local_sapic_eid: u8,
+    _proximity_hi: [u8; 3],
+    _clock_domain: u32,
+}
+
+/// Memory affinity entry (type 1), minus its 2-byte type/length prefix.
+#[repr(C, packed)]
+struct SratMemoryAffinity {
+    proximity_domain: u32,
+    _reserved1: u16,
+    base_address_lo: u32,
+    base_address_hi: u32,
+    length_lo: u32,
+    length_hi: u32,
+    _reserved2: u32,
+    flags: u32,
+    _reserved3: u64,
+}
+
+/// Processor Local x2APIC affinity entry (type 2), minus its 2-byte type/length prefix.
+#[repr(C, packed)]
+struct SratLocalX2Apic {
+    _reserved: u16,
+    proximity_domain: u32,
+    x2apic_id: u32,
+    flags: u32,
+    _clock_domain: u32,
+    _reserved2: u32,
+}
+
+/// Narrows a 32-bit proximity domain to the `u8` node number the tables store.
+///
+/// Returns `None` above 255 so such entries are dropped rather than silently
+/// aliased onto a low-numbered node by a truncating cast.
+fn node_from_domain(domain: u32) -> Option<u8> {
+    if domain <= u32::from(u8::MAX) {
+        Some(domain as u8)
+    } else {
+        None
+    }
+}
+
+/// Finds the SRAT and records its processor and memory affinity entries.
+///
+/// Publishes nothing when the table is missing, duplicated, mis-signed, or
+/// shorter than the 12 bytes skipped before the first entry. Entries past the
+/// table capacities, and entries whose proximity domain does not fit a `u8`,
+/// are skipped.
+pub fn init() {
+    let sdt = match find_sdt("SRAT").as_slice() {
+        [] => return,
+        [x] => *x,
+        xs => {
+            println!("SRAT: {} tables found, expected 1", xs.len());
+            return;
+        }
+    };
+    if &sdt.signature != b"SRAT" {
+        return;
+    }
+    let data_addr = sdt.data_address();
+    let data_len = sdt.data_len();
+    if data_len < 12 {
+        println!("SRAT: table too short ({data_len} bytes)");
+        return;
+    }
+    let mut offset: usize = 12;
+    // Reach the `static mut` tables through raw pointers instead of borrowing them directly.
+    let cpu_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_CPU_ENTRIES) };
+    let mem_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_MEM_ENTRIES) };
+    let mut cpu_count: usize = 0;
+    let mut mem_count: usize = 0;
+    while offset + 2 <= data_len {
+        let entry_type = unsafe { *((data_addr + offset) as *const u8) };
+        let entry_len = unsafe { *((data_addr + offset + 1) as *const u8) } as usize;
+        // A length under 2 would never advance `offset`; one past the end is truncated.
+        if entry_len < 2 || offset + entry_len > data_len {
+            break;
+        }
+        let entry_data = data_addr + offset + 2;
+        match entry_type {
+            0x0 if entry_len >= size_of::<SratLocalApic>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratLocalApic) };
+                let enabled = (e.flags & 1) == 1;
+                // The domain is stored as one low byte plus three high bytes.
+                let domain = (e._proximity_lo as u32)
+                    | ((e._proximity_hi[0] as u32) << 8)
+                    | ((e._proximity_hi[1] as u32) << 16)
+                    | ((e._proximity_hi[2] as u32) << 24);
+                if let Some(node) = node_from_domain(domain) {
+                    if cpu_count < MAX_CPU_ENTRIES {
+                        cpu_entries[cpu_count] =
+                            SratCpuEntry { apic_id: e.apic_id as u32, node, enabled };
+                        cpu_count += 1;
+                    }
+                }
+            }
+            0x1 if entry_len >= size_of::<SratMemoryAffinity>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratMemoryAffinity) };
+                let enabled = (e.flags & 1) == 1;
+                let base = (e.base_address_hi as u64) << 32 | e.base_address_lo as u64;
+                let length = (e.length_hi as u64) << 32 | e.length_lo as u64;
+                if let Some(node) = node_from_domain(e.proximity_domain) {
+                    if mem_count < MAX_MEM_ENTRIES {
+                        mem_entries[mem_count] = SratMemEntry { node, base, length, enabled };
+                        mem_count += 1;
+                    }
+                }
+            }
+            0x2 if entry_len >= size_of::<SratLocalX2Apic>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratLocalX2Apic) };
+                let enabled = (e.flags & 1) == 1;
+                if let Some(node) = node_from_domain(e.proximity_domain) {
+                    if cpu_count < MAX_CPU_ENTRIES {
+                        cpu_entries[cpu_count] =
+                            SratCpuEntry { apic_id: e.x2apic_id, node, enabled };
+                        cpu_count += 1;
+                    }
+                }
+            }
+            _ => {}
+        }
+        offset += entry_len;
+    }
+    unsafe {
+        SRAT_CPU_COUNT = cpu_count;
+        SRAT_MEM_COUNT = mem_count;
+        SRAT_AVAILABLE = true;
+    }
+    debug!("SRAT: {} CPU entries, {} memory entries", cpu_count, mem_count);
+}
diff --git a/recipes/core/kernel/source/src/allocator/mod.rs b/recipes/core/kernel/source/src/allocator/mod.rs
index 4fdb0ba16e..aaa719635e 100644
--- a/recipes/core/kernel/source/src/allocator/mod.rs
+++ b/recipes/core/kernel/source/src/allocator/mod.rs
@@ -7,26 +7,40 @@ mod linked_list;
 /// Size of kernel heap
 const KERNEL_HEAP_SIZE: usize = ::rmm::MEGABYTE;
 
+#[cold]
+fn halt_kernel_heap_init(message: &str) -> ! {
+    print!("{message}");
+    println!("Kernel heap initialization cannot continue. Halting.");
+    loop {
+        core::hint::spin_loop();
+    }
+}
+
 unsafe fn map_heap(mapper: &mut KernelMapper, offset: usize, size: usize) {
     let mut flush_all = PageFlushAll::new();
 
     let heap_start_page = Page::containing_address(VirtualAddress::new(offset));
     let heap_end_page = Page::containing_address(VirtualAddress::new(offset + size - 1));
     for page in Page::range_inclusive(heap_start_page, heap_end_page) {
-        let phys = mapper
-            .allocator_mut()
-            .allocate_one()
-            .expect("failed to allocate kernel heap");
+        let phys = match mapper.allocator_mut().allocate_one() {
+            Some(phys) => phys,
+            None => halt_kernel_heap_init(
+                "FATAL: failed to allocate physical frame for kernel heap\n",
+            ),
+        };
         let flush = unsafe {
-            mapper
-                .map_phys(
-                    page.start_address(),
-                    phys,
-                    PageFlags::new()
-                        .write(true)
-                        .global(cfg!(not(feature = "pti"))),
-                )
-                .expect("failed to map kernel heap")
+            match mapper.map_phys(
+                page.start_address(),
+                phys,
+                PageFlags::new()
+                    .write(true)
+                    .global(cfg!(not(feature = "pti"))),
+            ) {
+                Some(flush) => flush,
+                None => halt_kernel_heap_init(
+                    "FATAL: failed to map kernel heap virtual page\n",
+                ),
+            }
         };
flush_all.consume(flush); } diff --git a/recipes/core/kernel/source/src/arch/aarch64/start.rs b/recipes/core/kernel/source/src/arch/aarch64/start.rs index e1c8cfb4ae..65e3fe339b 100644 --- a/recipes/core/kernel/source/src/arch/aarch64/start.rs +++ b/recipes/core/kernel/source/src/arch/aarch64/start.rs @@ -91,7 +91,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! { dtb::serial::init_early(dtb); } - info!("Redox OS starting..."); + info!("RedBear OS starting..."); args.print(); // Initialize RMM diff --git a/recipes/core/kernel/source/src/arch/riscv64/start.rs b/recipes/core/kernel/source/src/arch/riscv64/start.rs index 2551968f05..a825536aa9 100644 --- a/recipes/core/kernel/source/src/arch/riscv64/start.rs +++ b/recipes/core/kernel/source/src/arch/riscv64/start.rs @@ -97,7 +97,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! { init_early(dtb); } - info!("Redox OS starting..."); + info!("RedBear OS starting..."); args.print(); if let Some(dtb) = &dtb { diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs index fb66d3bf2b..b7656dba57 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs @@ -14,6 +14,10 @@ pub struct IoApicRegs { pointer: *const u32, } impl IoApicRegs { + fn redirection_index_valid(&mut self, idx: u8) -> bool { + idx <= self.max_redirection_table_entries() + } + fn ioregsel(&self) -> *const u32 { self.pointer } @@ -44,21 +48,28 @@ impl IoApicRegs { pub fn read_ioapicver(&mut self) -> u32 { self.read_reg(0x01) } - pub fn read_ioredtbl(&mut self, idx: u8) -> u64 { - assert!(idx < 24); + pub fn read_ioredtbl(&mut self, idx: u8) -> Option { + if !self.redirection_index_valid(idx) { + warn!("IOAPIC read_ioredtbl index {} out of range", idx); + return None; + } let lo = self.read_reg(0x10 + idx * 2); let hi = self.read_reg(0x10 + idx * 2 + 1); - 
u64::from(lo) | (u64::from(hi) << 32) + Some(u64::from(lo) | (u64::from(hi) << 32)) } - pub fn write_ioredtbl(&mut self, idx: u8, value: u64) { - assert!(idx < 24); + pub fn write_ioredtbl(&mut self, idx: u8, value: u64) -> bool { + if !self.redirection_index_valid(idx) { + warn!("IOAPIC write_ioredtbl index {} out of range", idx); + return false; + } let lo = value as u32; let hi = (value >> 32) as u32; self.write_reg(0x10 + idx * 2, lo); self.write_reg(0x10 + idx * 2 + 1, hi); + true } pub fn max_redirection_table_entries(&mut self) -> u8 { @@ -92,17 +103,37 @@ impl IoApic { } /// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode). #[allow(dead_code)] - pub fn map(&self, idx: u8, info: MapInfo) { - self.regs.lock().write_ioredtbl(idx, info.as_raw()) + pub fn map(&self, idx: u8, info: MapInfo) -> bool { + let Some(raw) = info.as_raw() else { + return false; + }; + self.regs.lock().write_ioredtbl(idx, raw) } pub fn set_mask(&self, gsi: u32, mask: bool) { let idx = (gsi - self.gsi_start) as u8; let mut guard = self.regs.lock(); - let mut reg = guard.read_ioredtbl(idx); + let Some(mut reg) = guard.read_ioredtbl(idx) else { + return; + }; reg &= !(1 << 16); reg |= u64::from(mask) << 16; - guard.write_ioredtbl(idx, reg); + let _ = guard.write_ioredtbl(idx, reg); + } + /// Change the destination APIC for a GSI by reprogramming the redirection table entry. + /// Preserves all other fields (vector, polarity, trigger mode, delivery mode, mask). + /// Returns true if the entry was successfully updated. 
+ pub fn set_irq_affinity(&self, gsi: u32, dest: ApicId) -> bool { + let idx = (gsi - self.gsi_start) as u8; + let mut guard = self.regs.lock(); + let Some(mut entry) = guard.read_ioredtbl(idx) else { + return false; + }; + // Clear destination field (bits 63:56 for xAPIC physical mode) + // and set new destination APIC ID + entry &= !(0xFF_u64 << 56); + entry |= u64::from(dest.get()) << 56; + guard.write_ioredtbl(idx, entry) } } @@ -149,19 +180,26 @@ pub struct MapInfo { } impl MapInfo { - pub fn as_raw(&self) -> u64 { - assert!(self.vector >= 0x20); - assert!(self.vector <= 0xFE); + pub fn as_raw(&self) -> Option { + if !(0x20..=0xFE).contains(&self.vector) { + warn!( + "Refusing to map IOAPIC vector outside valid range: {:#x}", + self.vector + ); + return None; + } // TODO: Check for reserved fields. - (u64::from(self.dest.get()) << 56) + Some( + (u64::from(self.dest.get()) << 56) | (u64::from(self.mask) << 16) | ((self.trigger_mode as u64) << 15) | ((self.polarity as u64) << 13) | ((self.dest_mode as u64) << 11) | ((self.delivery_mode as u64) << 8) - | u64::from(self.vector) + | u64::from(self.vector), + ) } } @@ -175,7 +213,7 @@ impl fmt::Debug for IoApic { let count = guard.max_redirection_table_entries(); f.debug_list() - .entries((0..count).map(|i| guard.read_ioredtbl(i))) + .entries((0..=count).filter_map(|i| guard.read_ioredtbl(i))) .finish() } } @@ -237,11 +275,14 @@ pub unsafe fn handle_ioapic(madt_ioapic: &'static MadtIoApic) { let ioapic_registers = virt.data() as *const u32; let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base); - assert_eq!( - ioapic.regs.lock().id(), - madt_ioapic.id, - "mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC" - ); + let detected_id = ioapic.regs.lock().id(); + if detected_id != madt_ioapic.id { + warn!( + "mismatched ACPI MADT I/O APIC ID: MADT={}, IOAPIC={}; continuing with detected hardware", + madt_ioapic.id, + detected_id + ); + } 
(*IOAPICS.get()).get_or_insert_with(Vec::new).push(ioapic); } @@ -310,11 +351,11 @@ pub unsafe fn init() { } } } - println!( - "I/O APICs: {:?}, overrides: {:?}", - ioapics(), - src_overrides() - ); + for ioapic in ioapics() { + for idx in 0..=ioapic.count { + ioapic.set_mask(ioapic.gsi_start + u32::from(idx), true); + } + } // map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it // wouldn't have been disabled due to this I/O APIC) @@ -329,7 +370,6 @@ pub unsafe fn init() { .iter() .any(|over| over.bus_irq == legacy_irq) { - // there's an IRQ conflict, making this legacy IRQ inaccessible. continue; } ( @@ -349,7 +389,6 @@ pub unsafe fn init() { let redir_tbl_index = (gsi - apic.gsi_start) as u8; let map_info = MapInfo { - // only send to the BSP dest: bsp_apic_id, dest_mode: DestinationMode::Physical, delivery_mode: DeliveryMode::Fixed, @@ -366,7 +405,32 @@ pub unsafe fn init() { }, vector: 32 + legacy_irq, }; - apic.map(redir_tbl_index, map_info); + if !apic.map(redir_tbl_index, map_info) { + warn!( + "Unable to map legacy IRQ {} (GSI {}) through IOAPIC index {}", + legacy_irq, + gsi, + redir_tbl_index + ); + } + + if legacy_irq == 0 && gsi != u32::from(legacy_irq) { + if let Some(apic0) = find_ioapic(u32::from(legacy_irq)) { + let idx0 = (u32::from(legacy_irq) - apic0.gsi_start) as u8; + let _ = apic0.map( + idx0, + MapInfo { + dest: bsp_apic_id, + dest_mode: DestinationMode::Physical, + delivery_mode: DeliveryMode::Fixed, + mask: false, + polarity: ApicPolarity::ActiveHigh, + trigger_mode: ApicTriggerMode::Edge, + vector: 32, + }, + ); + } + } } println!( "I/O APICs: {:?}, overrides: {:?}", @@ -406,7 +470,7 @@ fn resolve(irq: u8) -> u32 { fn find_ioapic(gsi: u32) -> Option<&'static IoApic> { ioapics() .iter() - .find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count)) + .find(|apic| gsi >= apic.gsi_start && gsi <= apic.gsi_start + u32::from(apic.count)) } pub unsafe fn mask(irq: u8) { @@ -425,3 
+489,14 @@ pub unsafe fn unmask(irq: u8) { }; apic.set_mask(gsi, false); } + +/// Change the destination CPU for an IRQ by reprogramming the IOAPIC redirection entry. +/// Resolves the legacy IRQ to its GSI, finds the owning IOAPIC, and updates the destination +/// APIC ID in the redirection table while preserving all other fields. +pub unsafe fn set_affinity(irq: u8, dest: ApicId) -> bool { + let gsi = resolve(irq); + match find_ioapic(gsi) { + Some(apic) => apic.set_irq_affinity(gsi, dest), + None => false, + } +} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs index 6f41770601..a1e0b78ad0 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs @@ -4,9 +4,11 @@ pub mod cpu; pub mod hpet; pub mod ioapic; pub mod local_apic; +pub mod msi; pub mod pic; pub mod pit; pub mod serial; +pub mod vector; #[cfg(feature = "system76_ec_debug")] pub mod system76_ec; @@ -23,8 +25,7 @@ pub unsafe fn init() { } } pub unsafe fn init_after_acpi() { - // this will disable the IOAPIC if needed. 
-    //ioapic::init(mapper);
+    unsafe { ioapic::init() };
 }
 
 unsafe fn init_hpet() -> bool {
diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs
new file mode 100644
index 0000000000..94ab973a26
--- /dev/null
+++ b/recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs
@@ -0,0 +1,236 @@
+// MSI/MSI-X support for x86 — kernel-level message composition and validation
+// Cross-referenced from Linux 7.0: arch/x86/kernel/apic/msi.c (391 lines)
+
+use crate::arch::device::local_apic::ApicId;
+
+/// Value every x86 MSI address carries in bits 31:20 (`0xFEE`).
+pub const MSI_ADDRESS_BASE: u64 = 0xFEE0_0000;
+/// Address bits that must equal `MSI_ADDRESS_BASE`: bit 20 and everything above.
+///
+/// The destination ID (bits 19:12) and the low hint/mode bits are excluded, so
+/// an address that targets a non-zero APIC ID still validates.
+pub const MSI_ADDRESS_MASK: u64 = 0xFFFF_FFFF_FFF0_0000;
+const MSI_DEST_MODE_LOGICAL: u64 = 1 << 2;
+const MSI_REDIRECTION_HINT: u64 = 1 << 3;
+
+/// Raw MSI message address.
+#[derive(Debug, Clone, Copy)]
+pub struct MsiAddress {
+    pub raw: u64,
+}
+
+/// Raw MSI message data word.
+#[derive(Debug, Clone, Copy)]
+pub struct MsiData {
+    pub raw: u32,
+}
+
+/// Address/data pair forming one MSI message.
+#[derive(Debug, Clone)]
+pub struct MsiMessage {
+    pub address: MsiAddress,
+    pub data: MsiData,
+}
+
+impl MsiAddress {
+    /// Builds an address with `dest_apic_id` in bits 19:12 and the
+    /// redirection-hint (bit 3) and logical-mode (bit 2) flags as requested.
+    pub fn new(dest_apic_id: u8, redirection_hint: bool, dest_mode_logical: bool) -> Self {
+        let mut addr = MSI_ADDRESS_BASE;
+        addr |= u64::from(dest_apic_id) << 12;
+        if redirection_hint {
+            addr |= MSI_REDIRECTION_HINT;
+        }
+        if dest_mode_logical {
+            addr |= MSI_DEST_MODE_LOGICAL;
+        }
+        Self { raw: addr }
+    }
+
+    /// Returns `true` when every bit of `addr` from bit 20 up matches the MSI base.
+    pub fn validate(addr: u64) -> bool {
+        (addr & MSI_ADDRESS_MASK) == MSI_ADDRESS_BASE
+    }
+
+    /// Extracts the destination ID from bits 19:12.
+    pub fn dest_apic_id(&self) -> u8 {
+        ((self.raw >> 12) & 0xFF) as u8
+    }
+}
+
+impl MsiData {
+    /// Packs the vector (bits 7:0), delivery mode (bits 10:8) and trigger mode
+    /// (bit 15) into a data word.
+    ///
+    /// `delivery_mode` and `trigger_mode` are masked to the widths the getters
+    /// read back, so an out-of-range value cannot spill into neighbouring bits.
+    pub fn new(vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
+        let mut data = u32::from(vector);
+        data |= u32::from(delivery_mode & 0x7) << 8;
+        data |= u32::from(trigger_mode & 0x1) << 15;
+        Self { raw: data }
+    }
+
+    /// Returns the interrupt vector (bits 7:0).
+    pub fn vector(&self) -> u8 {
+        (self.raw & 0xFF) as u8
+    }
+
+    /// Returns the delivery mode (bits 10:8).
+    pub fn delivery_mode(&self) -> u8 {
+        ((self.raw >> 8) & 0x7) as u8
+    }
+
+    /// Returns the trigger mode (bit 15).
+    pub fn trigger_mode(&self) -> u8 {
+        ((self.raw >> 15) & 0x1) as u8
+    }
+}
+
+impl MsiMessage {
+    /// Composes a message for `dest` without redirection hint or logical mode.
+    ///
+    /// NOTE(review): the APIC ID is narrowed to 8 bits here, so IDs above 255
+    /// would be truncated — confirm callers never pass them.
+    pub fn compose(dest: ApicId, vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
+        let address = MsiAddress::new(dest.get() as u8, false, false);
+        let data = MsiData::new(vector, delivery_mode, trigger_mode);
+        Self { address, data }
+    }
+
+    /// Returns `true` when the address validates and the vector is in `32..=254`.
+    pub fn validate(&self) -> bool {
+        MsiAddress::validate(self.address.raw)
+            && self.data.vector() >= 32
+            && self.data.vector() < 255
+    }
+}
+
+/// Free-function form of `MsiAddress::validate`.
+pub fn is_valid_msi_address(addr: u64) -> bool {
+    MsiAddress::validate(addr)
+}
+
+/// Returns `true` for vectors in `32..=254`.
+pub fn is_valid_msi_vector(vector: u8) -> bool {
+    vector >= 32 && vector < 255
+}
+
+/// Decoded fields of an MSI capability structure.
+#[derive(Debug)]
+pub struct MsiCapability {
+    pub msg_ctl: u16,
+    pub msg_addr_lo: u32,
+    pub msg_addr_hi: u32,
+    pub msg_data: u16,
+    pub mask_bits: u32,
+    pub pending_bits: u32,
+    pub is_64bit: bool,
+    pub is_maskable: bool,
+    pub multiple_message_capable: u8,
+}
+
+impl MsiCapability {
+    /// Decodes the capability from its first six dwords and its control word.
+    ///
+    /// `raw[1]` is the low address dword. With a 64-bit address (`msg_ctl`
+    /// bit 7) `raw[2]` is the high address dword and the data, mask and
+    /// pending registers are `raw[3]`, `raw[4]` and `raw[5]`; otherwise they
+    /// are `raw[2]`, `raw[3]` and `raw[4]`. Mask and pending bits are reported
+    /// as 0 unless per-vector masking (`msg_ctl` bit 8) is advertised.
+    pub fn parse(raw: &[u32; 6], msg_ctl: u16) -> Self {
+        let is_64bit = msg_ctl & (1 << 7) != 0;
+        let is_maskable = msg_ctl & (1 << 8) != 0;
+        // Dword holding the data register; mask and pending follow it directly.
+        let data_index = if is_64bit { 3 } else { 2 };
+        Self {
+            msg_ctl,
+            msg_addr_lo: raw[1],
+            msg_addr_hi: if is_64bit { raw[2] } else { 0 },
+            msg_data: (raw[data_index] & 0xFFFF) as u16,
+            mask_bits: if is_maskable { raw[data_index + 1] } else { 0 },
+            pending_bits: if is_maskable { raw[data_index + 2] } else { 0 },
+            is_64bit,
+            is_maskable,
+            multiple_message_capable: ((msg_ctl >> 1) & 0x7) as u8,
+        }
+    }
+}
+
+/// Decoded fields of an MSI-X capability structure.
+#[derive(Debug)]
+pub struct MsixCapability {
+    pub msg_ctl: u16,
+    pub table_offset: u32,
+    pub table_bar: u8,
+    pub pba_offset: u32,
+    pub pba_bar: u8,
+    pub table_size: u16,
+}
+
+impl MsixCapability {
+    /// Decodes the capability from its three dwords and its control word.
+    ///
+    /// The low 3 bits of `raw[1]` and `raw[2]` select a BAR and the remaining
+    /// bits are the table and PBA offsets. The table size field occupies
+    /// `msg_ctl` bits 10:0 and stores the entry count minus one.
+    pub fn parse(raw: &[u32; 3], msg_ctl: u16) -> Self {
+        Self {
+            msg_ctl,
+            table_offset: raw[1] & !0x7,
+            table_bar: (raw[1] & 0x7) as u8,
+            pba_offset: raw[2] & !0x7,
+            pba_bar: (raw[2] & 0x7) as u8,
+            table_size: (msg_ctl & 0x7FF) + 1,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_compose_message() {
+        let msg = MsiMessage::compose(ApicId::new(3), 48, 0b101, 1);
+        assert!(msg.validate());
+        assert_eq!(msg.address.dest_apic_id(), 3);
+        assert_eq!(msg.data.vector(), 48);
+        assert_eq!(msg.data.delivery_mode(), 0b101);
+        assert_eq!(msg.data.trigger_mode(), 1);
+    }
+
+    #[test]
+    fn test_invalid_address() {
+        assert!(!is_valid_msi_address(0xDEAD_BEEF));
+        assert!(is_valid_msi_address(0xFEE0_0000));
+        // Bits 31:20 must be exactly 0xFEE and nothing may be set above bit 31.
+        assert!(!is_valid_msi_address(0xFFF0_0000));
+        assert!(!is_valid_msi_address(0x1_FEE0_0000));
+    }
+
+    #[test]
+    fn test_msi_parse() {
+        let raw = [0u32; 6];
+        let cap = MsiCapability::parse(&raw, 0);
+        assert!(!cap.is_64bit);
+        assert!(!cap.is_maskable);
+    }
+
+    #[test]
+    fn test_msi_parse_64bit_maskable() {
+        let raw = [0, 0xFEE0_0000, 0, 0x30, 0xAAAA_AAAA, 0x5555_5555];
+        let cap = MsiCapability::parse(&raw, (1 << 7) | (1 << 8));
+        assert_eq!(cap.msg_data, 0x30);
+        assert_eq!(cap.mask_bits, 0xAAAA_AAAA);
+        assert_eq!(cap.pending_bits, 0x5555_5555);
+    }
+
+    #[test]
+    fn test_msix_table_size() {
+        let cap = MsixCapability::parse(&[0, 0x2003, 0x3003], 0x0007);
+        assert_eq!(cap.table_size, 8);
+        assert_eq!(cap.table_bar, 3);
+        assert_eq!(cap.table_offset, 0x2000);
+    }
+}
diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs
new file mode 100644
index 0000000000..cd59ac7965
--- /dev/null
+++ b/recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs
@@ -0,0 +1,63 @@
+use crate::cpu_set::LogicalCpuId;
+
+// Number of allocatable vectors, starting at vector 32. 223 keeps the highest
+// vector handed out at 254, the same upper bound the MSI vector validation
+// and the IDT's available-IRQ range use; 224 would also hand out vector 255.
+const VECTOR_COUNT: usize = 223;
+
+// Allocation bitmap: bit `n` of bank `b` set means vector `32 + 32 * b + n` is taken.
+static VECTORS: [core::sync::atomic::AtomicU32; 7] = [
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+];
+
+/// Claims the lowest free vector and returns it, or `None` when none is left.
+///
+/// The CPU argument is currently unused: all CPUs share one bitmap.
+pub fn allocate_vector(_cpu: LogicalCpuId) -> Option<u8> {
+    for (bank, slot) in VECTORS.iter().enumerate() {
+        let mut bits = slot.load(core::sync::atomic::Ordering::Acquire);
+        loop {
+            // Index of the lowest clear bit; 32 means this bank is full.
+            let free = bits.trailing_ones() as usize;
+            if free >= 32 {
+                break;
+            }
+            let bit = 1u32 << free;
+            match slot.compare_exchange_weak(
+                bits,
+                bits | bit,
+                core::sync::atomic::Ordering::AcqRel,
+                core::sync::atomic::Ordering::Acquire,
+            ) {
+                Ok(_) => {
+                    let vector = (bank * 32 + free) as u8;
+                    if vector < VECTOR_COUNT as u8 {
+                        return Some(vector + 32);
+                    }
+                    // Past the last usable vector: undo the claim and give up.
+                    slot.fetch_and(!bit, core::sync::atomic::Ordering::Release);
+                    return None;
+                }
+                // Lost a race or failed spuriously: retry with the fresh value.
+                Err(current) => bits = current,
+            }
+        }
+    }
+ None +} + +pub fn free_vector(_cpu: LogicalCpuId, vector: u8) { + if vector < 32 || (vector as usize) >= 32 + VECTOR_COUNT { + return; + } + let idx = (vector - 32) as usize; + let bank = idx / 32; + let bit = 1u32 << (idx % 32); + VECTORS[bank].fetch_and(!bit, core::sync::atomic::Ordering::Release); +} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs b/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs index cad344f3c2..f7acae35f3 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs @@ -192,6 +192,15 @@ impl ProcessorControlRegion { } } +#[cold] +fn halt_pcr_init() -> ! { + println!("FATAL: failed to allocate physical memory for Processor Control Region"); + println!("Processor startup cannot continue. Halting."); + loop { + core::hint::spin_loop(); + } +} + pub unsafe fn pcr() -> *mut ProcessorControlRegion { unsafe { // Primitive benchmarking of RDFSBASE and RDGSBASE in userspace, appears to indicate that @@ -375,7 +384,10 @@ pub fn allocate_and_init_pcr( .next_power_of_two() .trailing_zeros(); - let pcr_frame = crate::memory::allocate_p2frame(alloc_order).expect("failed to allocate PCR"); + let pcr_frame = match crate::memory::allocate_p2frame(alloc_order) { + Some(frame) => frame, + None => halt_pcr_init(), + }; let pcr_ptr = RmmA::phys_to_virt(pcr_frame.base()).data() as *mut ProcessorControlRegion; unsafe { core::ptr::write(pcr_ptr, ProcessorControlRegion::new_partial_init(cpu_id)) }; diff --git a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs b/recipes/core/kernel/source/src/arch/x86_shared/idt.rs index 500645855d..d5af75ddf0 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/idt.rs @@ -78,6 +78,15 @@ static INIT_BSP_IDT: SyncUnsafeCell = SyncUnsafeCell::new(Idt::new()); pub(crate) static IDTS: RwLock> = RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); +#[cold] +fn 
halt_idt_init() -> ! { + println!("FATAL: failed to allocate physical pages for backup interrupt stack"); + println!("Interrupt setup cannot continue. Halting."); + loop { + core::hint::spin_loop(); + } +} + #[inline] pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool { if cpu_id == LogicalCpuId::BSP { @@ -101,6 +110,8 @@ pub fn set_reserved(cpu_id: LogicalCpuId, index: u8, reserved: bool) { } pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator + 'static { + let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count(); + info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count); (32..=254).filter(move |&index| !is_reserved(cpu_id, index)) } @@ -161,8 +172,10 @@ pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt { .or_insert_with(|| Box::leak(Box::new(Idt::new()))); use crate::memory::{RmmA, RmmArch}; - let frames = crate::memory::allocate_p2frame(4) - .expect("failed to allocate pages for backup interrupt stack"); + let frames = match crate::memory::allocate_p2frame(4) { + Some(frames) => frames, + None => halt_idt_init(), + }; // Physical pages are mapped linearly. So is the linearly mapped virtual memory. 
let base_address = RmmA::phys_to_virt(frames.base()); diff --git a/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs b/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs index 7725a45d0a..bfe9f096a2 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs @@ -1,3 +1,5 @@ +use core::sync::atomic::{AtomicBool, Ordering}; + use syscall::Exception; use x86::irq::PageFaultError; @@ -10,6 +12,22 @@ use crate::{ syscall::flag::*, }; +static NMI_IN_PROGRESS: AtomicBool = AtomicBool::new(false); + +unsafe fn nmi_raw_serial_write(bytes: &[u8]) { + use crate::syscall::io::{Io, Pio}; + + let mut com1 = Pio::::new(0x3F8); + let lsr = Pio::::new(0x3F8 + 5); + + for &byte in bytes { + while lsr.read() & (1 << 5) == 0 { + core::hint::spin_loop(); + } + com1.write(byte); + } +} + interrupt_stack!(divide_by_zero, |stack| { println!("Divide by zero"); stack.trace(); @@ -55,9 +73,35 @@ interrupt_stack!(non_maskable, @paranoid, |stack| { #[cfg(not(all(target_arch = "x86_64", feature = "profiling")))] { - // TODO: This will likely deadlock - println!("Non-maskable interrupt"); - stack.dump(); + if NMI_IN_PROGRESS.swap(true, Ordering::SeqCst) { + return; + } + + unsafe { + nmi_raw_serial_write(b"Non-maskable interrupt\n"); + nmi_raw_serial_write(b" RIP: "); + + #[cfg(target_arch = "x86")] + let instruction_pointer = u64::from(stack.iret.eip); + #[cfg(target_arch = "x86_64")] + let instruction_pointer = stack.iret.rip; + + let mut buf = [0u8; 19]; + buf[0] = b'0'; + buf[1] = b'x'; + for i in 0..16 { + let nibble = ((instruction_pointer >> (60 - i * 4)) & 0xF) as u8; + buf[2 + i] = if nibble < 10 { + b'0' + nibble + } else { + b'a' + nibble - 10 + }; + } + buf[18] = b'\n'; + nmi_raw_serial_write(&buf); + } + + NMI_IN_PROGRESS.store(false, Ordering::SeqCst); } }); diff --git a/recipes/core/kernel/source/src/arch/x86_shared/mod.rs 
b/recipes/core/kernel/source/src/arch/x86_shared/mod.rs index e3c30501b8..11c33e9457 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/mod.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/mod.rs @@ -28,6 +28,8 @@ pub mod pti; /// Initialization and start function pub mod start; +pub mod sleep; + /// Stop function pub mod stop; diff --git a/recipes/core/kernel/source/src/arch/x86_shared/sleep.rs b/recipes/core/kernel/source/src/arch/x86_shared/sleep.rs new file mode 100644 index 0000000000..9f98c0d892 --- /dev/null +++ b/recipes/core/kernel/source/src/arch/x86_shared/sleep.rs @@ -0,0 +1,712 @@ +use alloc::{sync::Arc, vec::Vec}; +use core::{ + ptr::NonNull, + str::FromStr, + sync::atomic::{AtomicU32, Ordering}, +}; + +use acpi_ext::{ + aml::{namespace::AmlName, object::Object, Interpreter}, + registers::FixedRegisters, + sdt::{facs::Facs, fadt::Fadt, SdtHeader}, + AcpiTables, Handle, Handler, PhysicalMapping, +}; +use spin::Mutex; +use syscall::error::{Error, EINVAL, EIO}; +use x86::{segmentation::SegmentSelector, task, Ring}; + +use crate::{ + acpi::ACPI_ROOT_INFO, + arch::interrupt, + memory::{ + round_down_pages, round_up_pages, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, + RmmArch, VirtualAddress, PAGE_SIZE, + }, + syscall::io::{Io, Pio}, +}; + +const ACPI_SLP_TYP_SHIFT: u16 = 10; +const ACPI_SLP_TYP_MASK: u16 = 0x1C00; +const ACPI_SLP_EN: u16 = 1 << 13; +const WAKE_TRAMPOLINE_PHYS: usize = 0x8000; +const SLEEP_RETURN_OK: usize = 0; + +#[cfg(target_arch = "x86_64")] +static WAKE_TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/s3_wakeup")); + +#[repr(C, packed)] +#[derive(Clone, Copy, Debug, Default)] +struct DescriptorTableRegister { + limit: u16, + base: u64, +} + +#[repr(C, align(64))] +#[derive(Clone, Copy, Debug)] +struct FpuState { + bytes: [u8; 4096], +} + +impl Default for FpuState { + fn default() -> Self { + Self { bytes: [0; 4096] } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SleepState 
{ + S3, + S5, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SleepError { + UnsupportedArch, + MissingAcpi, + MissingFadt, + MissingFacs, + MissingSleepObject, + InvalidSleepObject, + UnsupportedPmControl, + UnsupportedAmlOperation, + SleepDidNotEnter, +} + +impl SleepError { + fn code(self) -> usize { + match self { + Self::UnsupportedArch => EINVAL as usize, + Self::MissingAcpi + | Self::MissingFadt + | Self::MissingFacs + | Self::MissingSleepObject + | Self::UnsupportedAmlOperation => EIO as usize, + Self::InvalidSleepObject | Self::UnsupportedPmControl | Self::SleepDidNotEnter => { + EINVAL as usize + } + } + } + + fn from_code(code: usize) -> Self { + match code as i32 { + x if x == EINVAL => Self::InvalidSleepObject, + _ => Self::MissingAcpi, + } + } +} + +#[derive(Clone, Copy, Debug, Default)] +struct SavedCpuContext { + entry_rsp: usize, + runtime_rsp: usize, + facs_address: usize, + cr0: usize, + cr2: usize, + cr3: usize, + cr4: usize, + rflags: usize, + gdtr: DescriptorTableRegister, + idtr: DescriptorTableRegister, + efer: u64, + fs_base: u64, + gs_base: u64, + kernel_gs_base: u64, + fpu: FpuState, +} + +static SAVED_CONTEXT: Mutex> = Mutex::new(None); +static AML_MUTEX_IDS: AtomicU32 = AtomicU32::new(1); + +#[derive(Clone, Copy, Debug)] +struct SleepTypeData { + a: u16, + b: u16, +} + +#[derive(Clone, Copy)] +struct KernelAcpiHandler; + +impl KernelAcpiHandler { + fn map_range(physical_address: usize, size: usize) -> (*mut u8, usize) { + let map_base = round_down_pages(physical_address); + let map_offset = physical_address - map_base; + let mapped_length = round_up_pages(size + map_offset); + + // SAFETY: The ACPI interpreter only requests firmware-described physical regions. 
+ unsafe { + let mut mapper = KernelMapper::lock_rw(); + for page_index in 0..mapped_length / PAGE_SIZE { + let (_, flush) = mapper + .map_linearly( + PhysicalAddress::new(map_base + page_index * PAGE_SIZE), + PageFlags::new(), + ) + .expect("failed to linearly map ACPI physical region"); + flush.flush(); + } + } + + let virtual_base = RmmA::phys_to_virt(PhysicalAddress::new(map_base)).data(); + ((virtual_base + map_offset) as *mut u8, mapped_length) + } +} + +impl Handler for KernelAcpiHandler { + unsafe fn map_physical_region(&self, physical_address: usize, size: usize) -> PhysicalMapping { + let (virtual_start, mapped_length) = Self::map_range(physical_address, size); + PhysicalMapping { + physical_start: physical_address, + virtual_start: NonNull::new(virtual_start.cast::()) + .expect("expected mapped ACPI virtual address to be non-null"), + region_length: size, + mapped_length, + handler: *self, + } + } + + fn unmap_physical_region(_region: &PhysicalMapping) {} + + fn read_u8(&self, address: usize) -> u8 { + // SAFETY: AML system-memory accesses are byte-addressable firmware regions. + unsafe { core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u8) } + } + + fn read_u16(&self, address: usize) -> u16 { + // SAFETY: AML system-memory accesses are word-addressable firmware regions. + unsafe { + core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u16) + } + } + + fn read_u32(&self, address: usize) -> u32 { + // SAFETY: AML system-memory accesses are dword-addressable firmware regions. + unsafe { + core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u32) + } + } + + fn read_u64(&self, address: usize) -> u64 { + // SAFETY: AML system-memory accesses are qword-addressable firmware regions. 
+ unsafe { + core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u64) + } + } + + fn write_u8(&self, address: usize, value: u8) { + // SAFETY: AML system-memory accesses are byte-addressable firmware regions. + unsafe { + core::ptr::write_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u8, value) + } + } + + fn write_u16(&self, address: usize, value: u16) { + // SAFETY: AML system-memory accesses are word-addressable firmware regions. + unsafe { + core::ptr::write_volatile( + RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u16, + value, + ) + } + } + + fn write_u32(&self, address: usize, value: u32) { + // SAFETY: AML system-memory accesses are dword-addressable firmware regions. + unsafe { + core::ptr::write_volatile( + RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u32, + value, + ) + } + } + + fn write_u64(&self, address: usize, value: u64) { + // SAFETY: AML system-memory accesses are qword-addressable firmware regions. 
+ unsafe { + core::ptr::write_volatile( + RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u64, + value, + ) + } + } + + fn read_io_u8(&self, port: u16) -> u8 { + Pio::::new(port).read() + } + + fn read_io_u16(&self, port: u16) -> u16 { + Pio::::new(port).read() + } + + fn read_io_u32(&self, port: u16) -> u32 { + Pio::::new(port).read() + } + + fn write_io_u8(&self, port: u16, value: u8) { + Pio::::new(port).write(value) + } + + fn write_io_u16(&self, port: u16, value: u16) { + Pio::::new(port).write(value) + } + + fn write_io_u32(&self, port: u16, value: u32) { + Pio::::new(port).write(value) + } + + fn read_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u8 { + 0 + } + + fn read_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u16 { + 0 + } + + fn read_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u32 { + 0 + } + + fn write_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u8) {} + + fn write_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u16) {} + + fn write_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u32) {} + + fn nanos_since_boot(&self) -> u64 { + 0 + } + + fn stall(&self, microseconds: u64) { + for _ in 0..(microseconds.saturating_mul(64)) { + core::hint::spin_loop(); + } + } + + fn sleep(&self, milliseconds: u64) { + for _ in 0..(milliseconds.saturating_mul(64_000)) { + core::hint::spin_loop(); + } + } + + fn create_mutex(&self) -> Handle { + Handle(AML_MUTEX_IDS.fetch_add(1, Ordering::Relaxed)) + } + + fn acquire(&self, _mutex: Handle, _timeout: u16) -> Result<(), acpi_ext::aml::AmlError> { + Ok(()) + } + + fn release(&self, _mutex: Handle) {} +} + +fn sleep_state_name(state: SleepState) -> &'static str { + match state { + SleepState::S3 => "\\_S3", + SleepState::S5 => "\\_S5", + } +} + +fn encode_sleep_type(value: u16) -> u16 { + if value <= 0x7 { + value << ACPI_SLP_TYP_SHIFT + } else { + value & ACPI_SLP_TYP_MASK + } +} + +fn 
load_interpreter() -> Result<( + Arc>, + PhysicalMapping, + Interpreter, +), SleepError> { + let root = *ACPI_ROOT_INFO.get().ok_or(SleepError::MissingAcpi)?; + let handler = KernelAcpiHandler; + + // SAFETY: ACPI root info is captured from the firmware-provided, already validated root table. + let tables = unsafe { + AcpiTables::from_rsdt(handler, root.revision, root.root_sdt_address.data()) + .map_err(|_| SleepError::MissingAcpi)? + }; + let fadt = tables.find_table::().ok_or(SleepError::MissingFadt)?; + let registers = Arc::new( + FixedRegisters::new(&fadt, handler).map_err(|_| SleepError::UnsupportedPmControl)?, + ); + let facs_address = fadt.facs_address().map_err(|_| SleepError::MissingFacs)?; + + // SAFETY: The FADT-supplied FACS address is used exactly as described by the ACPI spec. + let facs = unsafe { handler.map_physical_region::(facs_address, core::mem::size_of::()) }; + // SAFETY: The AML interpreter only needs an owned mapping of the same firmware FACS table. + let interpreter_facs = unsafe { + handler.map_physical_region::(facs_address, core::mem::size_of::()) + }; + let dsdt = tables.dsdt().map_err(|_| SleepError::MissingFadt)?; + let interpreter = Interpreter::new(handler, dsdt.revision, Arc::clone(&registers), Some(interpreter_facs)); + + // SAFETY: Each AML table mapping is owned by the interpreter during table loading.
+ unsafe { + let mapping = handler.map_physical_region::(dsdt.phys_address, dsdt.length as usize); + let stream = core::slice::from_raw_parts( + mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::()) as *const u8, + dsdt.length as usize - core::mem::size_of::(), + ); + interpreter + .load_table(stream) + .map_err(|_| SleepError::UnsupportedAmlOperation)?; + + for ssdt in tables.ssdts() { + let mapping = handler.map_physical_region::(ssdt.phys_address, ssdt.length as usize); + let stream = core::slice::from_raw_parts( + mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::()) as *const u8, + ssdt.length as usize - core::mem::size_of::(), + ); + interpreter + .load_table(stream) + .map_err(|_| SleepError::UnsupportedAmlOperation)?; + } + } + + Ok((registers, facs, interpreter)) +} + +fn sleep_type_data_from_interpreter( + interpreter: &Interpreter, + state: SleepState, +) -> Result { + let name = AmlName::from_str(sleep_state_name(state)).map_err(|_| SleepError::MissingSleepObject)?; + let object = interpreter + .evaluate(name, Vec::new()) + .map_err(|_| SleepError::MissingSleepObject)?; + + let Object::Package(package) = &*object else { + return Err(SleepError::InvalidSleepObject); + }; + + let Some(typa_object) = package.first() else { + return Err(SleepError::InvalidSleepObject); + }; + let Some(typb_object) = package.get(1) else { + return Err(SleepError::InvalidSleepObject); + }; + + let Object::Integer(typa) = &**typa_object else { + return Err(SleepError::InvalidSleepObject); + }; + let Object::Integer(typb) = &**typb_object else { + return Err(SleepError::InvalidSleepObject); + }; + + Ok(SleepTypeData { + a: encode_sleep_type(*typa as u16), + b: encode_sleep_type(*typb as u16), + }) +} + +fn sleep_type_data(state: SleepState) -> Result { + let (_registers, _facs, interpreter) = load_interpreter()?; + sleep_type_data_from_interpreter(&interpreter, state) +} + +fn install_wake_trampoline(stack_rsp: usize, cr3: usize) { + let trampoline_page = 
Page::containing_address(VirtualAddress::new(WAKE_TRAMPOLINE_PHYS)); + let trampoline_frame = PhysicalAddress::new(WAKE_TRAMPOLINE_PHYS); + + // SAFETY: The 0x8000 low-memory trampoline page is reserved by the kernel for bootstrap stubs. + let (result, _) = unsafe { + let mut mapper = KernelMapper::lock_rw(); + let result = mapper + .map_phys( + trampoline_page.start_address(), + trampoline_frame, + PageFlags::new().execute(true).write(true), + ) + .expect("failed to map S3 wake trampoline page"); + (result, mapper.table().phys().data()) + }; + result.flush(); + + for (index, value) in WAKE_TRAMPOLINE_DATA.iter().enumerate() { + // SAFETY: The trampoline page is mapped writable at the same virtual address as the physical page. + unsafe { + core::ptr::write_volatile((WAKE_TRAMPOLINE_PHYS as *mut u8).add(index), *value); + } + } + + // SAFETY: The wake trampoline layout reserves three qword fields immediately after the jump. + unsafe { + let stack_slot = (WAKE_TRAMPOLINE_PHYS + 8) as *mut u64; + let page_table_slot = stack_slot.add(1); + let code_slot = stack_slot.add(2); + stack_slot.write(stack_rsp as u64); + page_table_slot.write(cr3 as u64); + #[expect(clippy::fn_to_numeric_cast)] + code_slot.write(resume_from_s3_trampoline as usize as u64); + } + + // SAFETY: The trampoline mapping is no longer needed once the physical page has been populated. + let (_frame, _, flush) = unsafe { + KernelMapper::lock_rw() + .unmap_phys(trampoline_page.start_address()) + .expect("failed to unmap S3 wake trampoline page") + }; + flush.flush(); +} + +fn save_descriptor_tables(context: &mut SavedCpuContext) { + // SAFETY: SGDT/SIDT only read the current CPU descriptor-table registers into the provided storage. 
+ unsafe { + core::arch::asm!("sgdt [{}]", in(reg) &mut context.gdtr, options(nostack, preserves_flags)); + core::arch::asm!("sidt [{}]", in(reg) &mut context.idtr, options(nostack, preserves_flags)); + } +} + +fn save_fpu_state(context: &mut SavedCpuContext) { + // SAFETY: The kernel owns the current CPU at suspend entry and the FXSAVE buffer is 64-byte aligned. + unsafe { + core::arch::asm!( + "fxsave64 [{}]", + in(reg) context.fpu.bytes.as_mut_ptr(), + ); + } +} + +fn restore_fpu_state(context: &SavedCpuContext) { + // SAFETY: The saved FXSAVE image belongs to the same CPU context and matches the restore instruction. + unsafe { + core::arch::asm!( + "fxrstor64 [{}]", + in(reg) context.fpu.bytes.as_ptr(), + ); + } +} + +fn save_cpu_context(entry_rsp: usize) -> SavedCpuContext { + let mut context = SavedCpuContext { + entry_rsp, + ..SavedCpuContext::default() + }; + + // SAFETY: Reading control registers and MSRs is required to reconstruct the CPU execution state on wake. + unsafe { + core::arch::asm!( + "mov {}, cr0", + out(reg) context.cr0, + options(nostack, preserves_flags) + ); + core::arch::asm!( + "mov {}, cr2", + out(reg) context.cr2, + options(nostack, preserves_flags) + ); + core::arch::asm!( + "mov {}, cr3", + out(reg) context.cr3, + options(nostack, preserves_flags) + ); + core::arch::asm!( + "mov {}, cr4", + out(reg) context.cr4, + options(nostack, preserves_flags) + ); + core::arch::asm!( + "pushfq", + "pop {}", + out(reg) context.rflags, + options(preserves_flags) + ); + core::arch::asm!("mov {}, rsp", out(reg) context.runtime_rsp, options(nostack, preserves_flags)); + + context.efer = x86::msr::rdmsr(x86::msr::IA32_EFER); + context.fs_base = x86::msr::rdmsr(x86::msr::IA32_FS_BASE); + context.gs_base = x86::msr::rdmsr(x86::msr::IA32_GS_BASE); + context.kernel_gs_base = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE); + } + + save_descriptor_tables(&mut context); + save_fpu_state(&mut context); + context +} + +fn set_firmware_waking_vector(facs: &mut 
PhysicalMapping, vector: usize) {
+    // The 32-bit field is the real-mode entry point; the wake trampoline
+    // (s3_wakeup.asm) is assembled as USE16 code, so this is the field the
+    // firmware must use.
+    facs.firmware_waking_vector = vector as u32;
+    // Per the ACPI FACS definition, a non-zero X_Firmware_Waking_Vector takes
+    // precedence and makes firmware transfer control in protected/64-bit mode,
+    // which would execute the 16-bit trampoline with the wrong operand size.
+    // Keep it zero so the real-mode vector above is always the one honoured.
+    facs.x_firmware_waking_vector = 0;
+}
+
+/// Program the PM1 control block(s) to enter the sleep state described by
+/// `sleep_type`.
+///
+/// The sequence is:
+/// 1. write SLP_TYP (without SLP_EN) to PM1a, then to PM1b if it exists;
+/// 2. flush the caches with WBINVD;
+/// 3. write SLP_TYP | SLP_EN to PM1a, then to PM1b if it exists.
+///
+/// No SLP_EN write may precede the cache flush, and PM1a is armed before PM1b.
+///
+/// Returns `SleepError::UnsupportedPmControl` if any register access fails.
+/// On success the platform normally powers down before this function returns.
+fn write_pm1_control_block(
+    registers: &FixedRegisters,
+    sleep_type: SleepTypeData,
+) -> Result<(), SleepError> {
+    let pm1a = &registers.pm1_control_registers.pm1a;
+
+    // Write #1 (PM1a): program SLP_TYP while preserving unrelated bits.
+    let current_a = pm1a
+        .read()
+        .map_err(|_| SleepError::UnsupportedPmControl)? as u16;
+    let armed_a = (current_a & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.a;
+    pm1a.write(u64::from(armed_a))
+        .map_err(|_| SleepError::UnsupportedPmControl)?;
+
+    // Write #1 (PM1b, optional): same, but remember the value so SLP_EN can be
+    // added only after the cache flush below.
+    let pm1b_armed = if let Some(pm1b) = &registers.pm1_control_registers.pm1b {
+        let current_b = pm1b.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16;
+        let armed_b = (current_b & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.b;
+        pm1b.write(u64::from(armed_b))
+            .map_err(|_| SleepError::UnsupportedPmControl)?;
+        Some((pm1b, armed_b))
+    } else {
+        None
+    };
+
+    // SAFETY: WBINVD is required here to flush dirty cache lines before firmware powers down the CPU package.
+    unsafe {
+        core::arch::asm!("wbinvd", options(nostack, preserves_flags));
+    }
+
+    // Write #2: assert SLP_EN, PM1a first and then PM1b.
+    pm1a.write(u64::from(armed_a | ACPI_SLP_EN))
+        .map_err(|_| SleepError::UnsupportedPmControl)?;
+
+    if let Some((pm1b, armed_b)) = pm1b_armed {
+        pm1b.write(u64::from(armed_b | ACPI_SLP_EN))
+            .map_err(|_| SleepError::UnsupportedPmControl)?;
+    }
+
+    Ok(())
+}
+
+/// Naked entry shim: passes the stack pointer at entry (which points at the
+/// caller's return address) as the second argument and tail-jumps to
+/// `enter_sleep_raw_inner`.
+///
+/// NOTE(review): `resume_from_s3_trampoline` later returns through this saved
+/// stack pointer restoring only RFLAGS; nothing in this shim saves the
+/// callee-saved registers (rbx, rbp, r12-r15) - TODO confirm they are
+/// preserved for the caller on the resume path.
+#[unsafe(naked)]
+unsafe extern "sysv64" fn enter_sleep_raw(state: usize) -> usize {
+    core::arch::naked_asm!(
+        "mov rsi, rsp",
+        "jmp {inner}",
+        inner = sym enter_sleep_raw_inner,
+    );
+}
+
+/// Arm the wake path and request the sleep state `state` (3 = S3, 5 = S5).
+///
+/// Returns a `SleepError` code when the state cannot be entered. A successful
+/// S3 cycle does not return from here: `resume_from_s3_trampoline` returns to
+/// the original caller through `entry_rsp` instead.
+///
+/// On every failure after the wake path has been armed, the firmware waking
+/// vector is cleared, the saved CPU context is dropped, and the RFLAGS captured
+/// at entry (including the interrupt flag) are restored.
+extern "C" fn enter_sleep_raw_inner(state: usize, entry_rsp: usize) -> usize {
+    let state = match state {
+        3 => SleepState::S3,
+        5 => SleepState::S5,
+        _ => return SleepError::InvalidSleepObject.code(),
+    };
+
+    let (registers, mut facs, interpreter) = match load_interpreter() {
+        Ok(tuple) => tuple,
+        Err(error) => return error.code(),
+    };
+    let sleep_type = match sleep_type_data_from_interpreter(&interpreter, state) {
+        Ok(data) => data,
+        Err(error) => return error.code(),
+    };
+
+    let mut context = save_cpu_context(entry_rsp);
+    context.facs_address = facs.physical_start;
+    // Remember the flags captured at entry: `context` is moved into
+    // SAVED_CONTEXT below, and the failure path needs them.
+    let entry_rflags = context.rflags;
+    install_wake_trampoline(context.runtime_rsp, context.cr3);
+    set_firmware_waking_vector(&mut facs, WAKE_TRAMPOLINE_PHYS);
+
+    {
+        let mut saved = SAVED_CONTEXT.lock();
+        *saved = Some(context);
+    }
+
+    // SAFETY: Suspend entry must not be interrupted while the wake vector and PM1 control block are being armed.
+    unsafe {
+        interrupt::disable();
+    }
+
+    let failure = match write_pm1_control_block(registers.as_ref(), sleep_type) {
+        Err(error) => error,
+        Ok(()) => {
+            // SAFETY: The final CLI+HLT sequence is the architectural handoff point after asserting SLP_EN.
+            unsafe {
+                core::arch::asm!("cli; hlt", options(nostack));
+            }
+            // Execution only continues here if the platform never powered down.
+            SleepError::SleepDidNotEnter
+        }
+    };
+
+    // The sleep state was not entered: undo the arming steps so the system does
+    // not keep running with a stale wake vector, a stale saved context, or
+    // interrupts left disabled.
+    set_firmware_waking_vector(&mut facs, 0);
+    *SAVED_CONTEXT.lock() = None;
+
+    // SAFETY: This reloads the RFLAGS value captured by `save_cpu_context` at entry, restoring the caller's interrupt state.
+    unsafe {
+        core::arch::asm!(
+            "push {rflags}",
+            "popfq",
+            rflags = in(reg) entry_rflags,
+        );
+    }
+
+    failure.code()
+}
+
+extern "C" fn resume_from_s3_trampoline() -> ! {
+    let mut saved = SAVED_CONTEXT.lock();
+    let context = saved.take().expect("S3 wake trampoline resumed without saved CPU context");
+    drop(saved);
+
+    // SAFETY: The saved FACS physical address was captured from the validated FADT during suspend entry.
+ if context.facs_address != 0 { + let mut facs = unsafe { + KernelAcpiHandler.map_physical_region::( + context.facs_address, + core::mem::size_of::(), + ) + }; + set_firmware_waking_vector(&mut facs, 0); + } + + // SAFETY: The wake trampoline already switched to the saved kernel CR3 and long mode, so the remaining restores are architectural register state only. + unsafe { + x86::msr::wrmsr(x86::msr::IA32_EFER, context.efer); + core::arch::asm!("mov cr3, {}", in(reg) context.cr3, options(nostack)); + core::arch::asm!("mov cr4, {}", in(reg) context.cr4, options(nostack)); + core::arch::asm!("mov cr2, {}", in(reg) context.cr2, options(nostack)); + core::arch::asm!("mov cr0, {}", in(reg) context.cr0, options(nostack)); + core::arch::asm!("lgdt [{}]", in(reg) &context.gdtr, options(nostack)); + core::arch::asm!("lidt [{}]", in(reg) &context.idtr, options(nostack)); + + task::load_tr(SegmentSelector::new(crate::arch::gdt::GDT_TSS as u16, Ring::Ring0)); + + x86::msr::wrmsr(x86::msr::IA32_FS_BASE, context.fs_base); + x86::msr::wrmsr(x86::msr::IA32_GS_BASE, context.gs_base); + x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, context.kernel_gs_base); + } + + restore_fpu_state(&context); + + // SAFETY: Returning with the original entry stack and RFLAGS completes the suspend call as a successful function return. 
+ unsafe { + core::arch::asm!( + "mov rsp, {entry_rsp}", + "push {rflags}", + "popfq", + "xor eax, eax", + "ret", + entry_rsp = in(reg) context.entry_rsp, + rflags = in(reg) context.rflags, + options(noreturn) + ); + } +} + +pub fn enter_sleep_state(state: SleepState) -> core::result::Result<(), SleepError> { + #[cfg(not(target_arch = "x86_64"))] + { + let _ = state; + return Err(SleepError::UnsupportedArch); + } + + #[cfg(target_arch = "x86_64")] + { + let raw = unsafe { + enter_sleep_raw(match state { + SleepState::S3 => 3, + SleepState::S5 => 5, + }) + }; + if raw == SLEEP_RETURN_OK { + Ok(()) + } else { + Err(SleepError::from_code(raw)) + } + } +} + +pub fn available_sleep_states() -> &'static [u8] { + if sleep_type_data(SleepState::S3).is_ok() { + b"S3\nS5\n" + } else { + b"S5\n" + } +} + +pub fn trigger_sleep_request(request: &str) -> Result<(), Error> { + match request.trim() { + "S3" => enter_sleep_state(SleepState::S3).map_err(|_| Error::new(EIO)), + "S5" => enter_sleep_state(SleepState::S5).map_err(|_| Error::new(EIO)), + _ => Err(Error::new(EINVAL)), + } +} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/start.rs b/recipes/core/kernel/source/src/arch/x86_shared/start.rs index 7a7c0ae815..cf3e433bee 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/start.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/start.rs @@ -82,6 +82,15 @@ extern "C" fn kstart() { /// The entry to Rust, all things must be initialized unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { unsafe { + // EARLY CANARY: write 'R' to COM1 before any kernel init. + // This proves the serial hardware works and the kernel reached Rust entry. + // If this character appears but "RedBear OS starting..." does not, + // the hang is in args_ptr.read(), serial::init(), or graphical_debug::init(). 
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'R', options(nostack, preserves_flags)); + } + let bootstrap = { let args = args_ptr.read(); @@ -91,27 +100,49 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { // Set up graphical debug graphical_debug::init(args.env()); - info!("Redox OS starting..."); + // SECOND CANARY: write 'S' to COM1 after serial init. + // If 'R' appears but 'S' does not, the hang is in serial::init() or graphical_debug::init(). + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'S', options(nostack, preserves_flags)); + } + + info!("RedBear OS starting..."); args.print(); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'1', options(nostack, preserves_flags)); } + // Set up GDT gdt::init_bsp(stack_end); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'2', options(nostack, preserves_flags)); } + // Set up IDT idt::init_bsp(); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'3', options(nostack, preserves_flags)); } + // Initialize RMM #[cfg(target_arch = "x86")] crate::startup::memory::init(&args, Some(0x100000), Some(0x40000000)); #[cfg(target_arch = "x86_64")] crate::startup::memory::init(&args, Some(0x100000), None); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'4', options(nostack, preserves_flags)); } + // Initialize paging paging::init(); #[cfg(target_arch = "x86_64")] crate::arch::alternative::early_init(true); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'5', options(nostack, 
preserves_flags)); } + // Set up syscall instruction interrupt::syscall::init(); @@ -121,6 +152,9 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { // Activate memory logging crate::log::init(); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'6', options(nostack, preserves_flags)); } + // Initialize miscellaneous processor features #[cfg(target_arch = "x86_64")] crate::arch::misc::init(LogicalCpuId::BSP); @@ -128,6 +162,9 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { // Initialize devices device::init(); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'7', options(nostack, preserves_flags)); } + // Read ACPI tables, starts APs if cfg!(feature = "acpi") { crate::acpi::init(args.acpi_rsdp()); diff --git a/recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm b/recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm new file mode 100644 index 0000000000..7beeccf603 --- /dev/null +++ b/recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm @@ -0,0 +1,110 @@ +; ACPI S3 wake trampoline +; compiled with nasm by build.rs, copied to physical 0x8000 before S3 entry + +ORG 0x8000 +SECTION .text +USE16 + +trampoline: + jmp short startup_wake + times 8 - ($ - trampoline) nop + .stack: dq 0 + .page_table: dq 0 + .code: dq 0 + +startup_wake: + cli + + xor ax, ax + mov ds, ax + mov es, ax + mov ss, ax + mov sp, 0 + + mov edi, [trampoline.page_table] + mov cr3, edi + + mov eax, cr0 + and al, 11110011b + or al, 00100010b + mov cr0, eax + + mov eax, cr4 + or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4 + mov cr4, eax + + fninit + + lgdt [gdtr] + + mov ecx, 0xC0000080 + rdmsr + or eax, 1 << 11 | 1 << 8 + wrmsr + + mov ebx, cr0 + or ebx, 1 << 31 | 1 << 16 | 1 + mov cr0, ebx + + jmp gdt.kernel_code:long_mode_wake + +USE64 +long_mode_wake: + mov rax, gdt.kernel_data + mov 
ds, rax + mov es, rax + mov fs, rax + mov gs, rax + mov ss, rax + + mov rsp, [trampoline.stack] + mov rax, [trampoline.code] + jmp rax + +struc GDTEntry + .limitl resw 1 + .basel resw 1 + .basem resb 1 + .attribute resb 1 + .flags__limith resb 1 + .baseh resb 1 +endstruc + +attrib: + .present equ 1 << 7 + .user equ 1 << 4 + .code equ 1 << 3 + .writable equ 1 << 1 + +flags: + .long_mode equ 1 << 5 + +gdtr: + dw gdt.end + 1 + dq gdt + +gdt: +.null equ $ - gdt + dq 0 + +.kernel_code equ $ - gdt +istruc GDTEntry + at GDTEntry.limitl, dw 0 + at GDTEntry.basel, dw 0 + at GDTEntry.basem, db 0 + at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code + at GDTEntry.flags__limith, db flags.long_mode + at GDTEntry.baseh, db 0 +iend + +.kernel_data equ $ - gdt +istruc GDTEntry + at GDTEntry.limitl, dw 0 + at GDTEntry.basel, dw 0 + at GDTEntry.basem, db 0 + at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable + at GDTEntry.flags__limith, db 0 + at GDTEntry.baseh, db 0 +iend + +.end equ $ - gdt diff --git a/recipes/core/kernel/source/src/context/arch/aarch64.rs b/recipes/core/kernel/source/src/context/arch/aarch64.rs index 33dc83a987..b8f8ac95d7 100644 --- a/recipes/core/kernel/source/src/context/arch/aarch64.rs +++ b/recipes/core/kernel/source/src/context/arch/aarch64.rs @@ -4,16 +4,10 @@ use crate::{ percpu::PercpuBlock, syscall::FloatRegisters, }; -use core::{mem::offset_of, ptr, sync::atomic::AtomicBool}; +use core::{mem::offset_of, ptr}; use spin::Once; use syscall::{EnvRegisters, Result}; -/// This must be used by the kernel to ensure that context switches are done atomically -/// Compare and exchange this to true when beginning a context switch on any CPU -/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch -/// This must be done, as no locks can be held on the stack during switch -pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); - // 512 bytes for registers, extra bytes for 
fpcr and fpsr pub const KFX_ALIGN: usize = 16; diff --git a/recipes/core/kernel/source/src/context/arch/riscv64.rs b/recipes/core/kernel/source/src/context/arch/riscv64.rs index 4bd843e620..fe63639acb 100644 --- a/recipes/core/kernel/source/src/context/arch/riscv64.rs +++ b/recipes/core/kernel/source/src/context/arch/riscv64.rs @@ -2,13 +2,11 @@ use crate::{ arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA, percpu::PercpuBlock, syscall::FloatRegisters, }; -use core::{mem::offset_of, sync::atomic::AtomicBool}; +use core::mem::offset_of; use rmm::{Arch, VirtualAddress}; use spin::Once; use syscall::{error::*, EnvRegisters}; -pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); - pub const KFX_ALIGN: usize = 16; #[derive(Clone, Debug, Default)] diff --git a/recipes/core/kernel/source/src/context/arch/x86.rs b/recipes/core/kernel/source/src/context/arch/x86.rs index 2862d35f20..dc01f6e707 100644 --- a/recipes/core/kernel/source/src/context/arch/x86.rs +++ b/recipes/core/kernel/source/src/context/arch/x86.rs @@ -1,4 +1,4 @@ -use core::{mem::offset_of, sync::atomic::AtomicBool}; +use core::mem::offset_of; use rmm::{Arch, VirtualAddress}; use spin::Once; use syscall::{error::*, EnvRegisters}; @@ -14,12 +14,6 @@ use crate::{ syscall::FloatRegisters, }; -/// This must be used by the kernel to ensure that context switches are done atomically -/// Compare and exchange this to true when beginning a context switch on any CPU -/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch -/// This must be done, as no locks can be held on the stack during switch -pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); - const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000; pub const KFX_ALIGN: usize = 16; diff --git a/recipes/core/kernel/source/src/context/arch/x86_64.rs b/recipes/core/kernel/source/src/context/arch/x86_64.rs index 6758c9fca5..574d373887 100644 --- 
a/recipes/core/kernel/source/src/context/arch/x86_64.rs +++ b/recipes/core/kernel/source/src/context/arch/x86_64.rs @@ -1,6 +1,5 @@ use core::{ ptr::{addr_of, addr_of_mut}, - sync::atomic::AtomicBool, }; use crate::syscall::FloatRegisters; @@ -12,12 +11,6 @@ use spin::Once; use syscall::{error::*, EnvRegisters}; use x86::msr; -/// This must be used by the kernel to ensure that context switches are done atomically -/// Compare and exchange this to true when beginning a context switch on any CPU -/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch -/// This must be done, as no locks can be held on the stack during switch -pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); - const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000; #[cfg(cpu_feature_never = "xsave")] diff --git a/recipes/core/kernel/source/src/context/context.rs b/recipes/core/kernel/source/src/context/context.rs index c97c5166be..6d723f498f 100644 --- a/recipes/core/kernel/source/src/context/context.rs +++ b/recipes/core/kernel/source/src/context/context.rs @@ -148,6 +148,8 @@ pub struct Context { pub euid: u32, pub egid: u32, pub pid: usize, + /// Supplementary group IDs for access control decisions. 
+ pub groups: Vec, // See [`PreemptGuard`] // @@ -204,6 +206,7 @@ impl Context { euid: 0, egid: 0, pid: 0, + groups: Vec::new(), #[cfg(feature = "syscall_debug")] syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(), @@ -479,6 +482,7 @@ impl Context { uid: self.euid, gid: self.egid, pid: self.pid, + groups: self.groups.clone(), } } } diff --git a/recipes/core/kernel/source/src/context/file.rs b/recipes/core/kernel/source/src/context/file.rs index 2d3790f147..150f483a47 100644 --- a/recipes/core/kernel/source/src/context/file.rs +++ b/recipes/core/kernel/source/src/context/file.rs @@ -4,7 +4,7 @@ use crate::{ event, scheme::{self, SchemeId}, sync::{CleanLockToken, RwLock, L6}, - syscall::error::Result, + syscall::error::{Error, Result, ESTALE}, }; use alloc::sync::Arc; use syscall::{schemev2::NewFdFlags, RwFlags, O_APPEND, O_NONBLOCK}; @@ -18,6 +18,7 @@ pub struct FileDescription { pub offset: u64, /// The scheme that this file refers to pub scheme: SchemeId, + pub scheme_generation: Option, /// The number the scheme uses to refer to this file pub number: usize, /// The flags passed to open or fcntl(SETFL) @@ -32,6 +33,52 @@ bitflags! 
{ } } impl FileDescription { + pub fn with_generation( + scheme: SchemeId, + scheme_generation: Option, + number: usize, + offset: u64, + flags: u32, + internal_flags: InternalFlags, + ) -> Self { + Self { + offset, + scheme, + scheme_generation, + number, + flags, + internal_flags, + } + } + + pub fn new( + scheme: SchemeId, + number: usize, + offset: u64, + flags: u32, + internal_flags: InternalFlags, + token: &mut CleanLockToken, + ) -> Self { + Self::with_generation( + scheme, + Some(scheme::current_scheme_generation(token.token(), scheme)), + number, + offset, + flags, + internal_flags, + ) + } + + pub fn get_scheme(&self, token: &mut CleanLockToken) -> Result { + if let Some(expected_generation) = self.scheme_generation + && expected_generation != scheme::current_scheme_generation(token.token(), self.scheme) + { + return Err(Error::new(ESTALE)); + } + + scheme::get_scheme(token.token(), self.scheme) + } + pub fn rw_flags(&self, rw: RwFlags) -> u32 { let mut ret = self.flags & !(O_NONBLOCK | O_APPEND) as u32; if rw.contains(RwFlags::APPEND) { @@ -76,7 +123,7 @@ impl FileDescription { pub fn try_close(self, token: &mut CleanLockToken) -> Result<()> { event::unregister_file(self.scheme, self.number, token); - let scheme = scheme::get_scheme(token.token(), self.scheme)?; + let scheme = self.get_scheme(token)?; scheme.close(self.number, token) } @@ -85,12 +132,12 @@ impl FileDescription { impl FileDescriptor { pub fn close(self, token: &mut CleanLockToken) -> Result<()> { { - let (scheme_id, number, internal_flags) = { + let (desc, number, internal_flags) = { let desc = self.description.read(token.token()); - (desc.scheme, desc.number, desc.internal_flags) + (*desc, desc.number, desc.internal_flags) }; if internal_flags.contains(InternalFlags::NOTIFY_ON_NEXT_DETACH) { - let scheme = scheme::get_scheme(token.token(), scheme_id)?; + let scheme = desc.get_scheme(token)?; scheme.detach(number, token)?; } } diff --git a/recipes/core/kernel/source/src/context/memory.rs 
b/recipes/core/kernel/source/src/context/memory.rs index 93446ba7a7..127a34fd87 100644 --- a/recipes/core/kernel/source/src/context/memory.rs +++ b/recipes/core/kernel/source/src/context/memory.rs @@ -64,14 +64,13 @@ impl UnmapResult { return Ok(()); }; - let (scheme_id, number) = { - let desc = description.write(token.token()); - (desc.scheme, desc.number) + let (scheme, number) = { + let desc = *description.read(token.token()); + (desc.get_scheme(token)?, desc.number) }; - let scheme_opt = scheme::get_scheme(token.token(), scheme_id); - let funmap_result = scheme_opt - .and_then(|scheme| scheme.kfunmap(number, base_offset, self.size, self.flags, token)); + let funmap_result = scheme + .kfunmap(number, base_offset, self.size, self.flags, token); if let Ok(fd) = Arc::try_unwrap(description) { fd.into_inner().try_close(token)?; @@ -2687,20 +2686,13 @@ fn correct_inner<'l>( // XXX: This is cheating, but guaranteed we won't deadlock because we've dropped addr_space_guard let mut token = unsafe { CleanLockToken::new() }; - let (scheme_id, scheme_number) = { - let desc = &file_ref.description.read(token.token()); - (desc.scheme, desc.number) + let desc = *file_ref.description.read(token.token()); + let scheme = desc.get_scheme(&mut token).map_err(|_| PfError::Segv)?; + let scheme_number = desc.number; + let user_inner = match scheme { + KernelSchemes::User(user) => user.inner, + _ => return Err(PfError::Segv), }; - let user_inner = scheme::get_scheme(token.token(), scheme_id) - .ok() - .and_then(|s| { - if let KernelSchemes::User(user) = s { - Some(user.inner) - } else { - None - } - }) - .ok_or(PfError::Segv)?; let offset = file_ref.base_offset as u64 + (pages_from_grant_start * PAGE_SIZE) as u64; user_inner diff --git a/recipes/core/kernel/source/src/context/mod.rs b/recipes/core/kernel/source/src/context/mod.rs index 37c73f5a37..df44cc4565 100644 --- a/recipes/core/kernel/source/src/context/mod.rs +++ b/recipes/core/kernel/source/src/context/mod.rs @@ -14,8 +14,8 @@ 
use crate::{ memory::{RmmA, RmmArch, TableKind}, percpu::PercpuBlock, sync::{ - ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard, - RwLockWriteGuard, L0, L1, L2, L4, + ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex, + MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4, }, syscall::error::Result, }; @@ -74,10 +74,12 @@ pub use self::arch::empty_cr3; // the context file descriptors. static CONTEXTS: RwLock> = RwLock::new(BTreeSet::new()); -// Actual context store for the scheduler -static RUN_CONTEXTS: Mutex = Mutex::new(RunContextData::new()); +// Actual context store for the scheduler — uses MCS fair spinlock to +// eliminate cache-line bouncing under multi-CPU contention. +static RUN_CONTEXTS: McsMutex = McsMutex::new(RunContextData::new()); -// Context that has been pushed out from RUN_CONTEXTS after being idle +// Context that has been pushed out from RUN_CONTEXTS after being idle. +// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock). static IDLE_CONTEXTS: Mutex> = Mutex::new(VecDeque::new()); pub struct RunContextData { @@ -113,7 +115,7 @@ pub fn idle_contexts_try( IDLE_CONTEXTS.try_lock(token) } -pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> { +pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> { RUN_CONTEXTS.lock(token) } diff --git a/recipes/core/kernel/source/src/context/switch.rs b/recipes/core/kernel/source/src/context/switch.rs index 86684c8f4c..2dbed065eb 100644 --- a/recipes/core/kernel/source/src/context/switch.rs +++ b/recipes/core/kernel/source/src/context/switch.rs @@ -15,7 +15,7 @@ use crate::{ use alloc::{sync::Arc, vec::Vec}; use core::{ cell::{Cell, RefCell}, - hint, mem, + mem, sync::atomic::Ordering, }; use syscall::PtraceFlags; @@ -26,6 +26,11 @@ enum UpdateResult { Blocked, } +/// Default number of PIT ticks before triggering a context switch. 
+/// At ~2.25 ms per tick, 3 ticks ≈ 6.75 ms timeslice. +/// Configurable per-CPU via `ContextSwitchPercpu::preempt_interval`. +const DEFAULT_PREEMPT_INTERVAL: usize = 3; + // A simple geometric series where value[i] ~= value[i - 1] * 1.25 const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ 88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904, @@ -90,13 +95,15 @@ struct SwitchResultInner { /// /// The function also calls the signal handler after switching contexts. pub fn tick(token: &mut CleanLockToken) { - let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks; + let percpu = PercpuBlock::current(); + let ticks_cell = &percpu.switch_internals.pit_ticks; let new_ticks = ticks_cell.get() + 1; ticks_cell.set(new_ticks); - // Trigger a context switch after every 3 ticks (approx. 6.75 ms). - if new_ticks >= 3 { + // Trigger a context switch when the per-CPU preempt interval is reached. + let interval = percpu.switch_internals.preempt_interval.get(); + if new_ticks >= interval { switch(token); crate::context::signal::signal_handler(token); } @@ -120,7 +127,10 @@ pub unsafe extern "C" fn switch_finish_hook() { crate::arch::stop::emergency_reset(); } } - arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); + PercpuBlock::current() + .switch_internals + .in_context_switch + .set(false); crate::percpu::switch_arch_hook(); } } @@ -150,16 +160,15 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks percpu.switch_internals.pit_ticks.set(0); - // Acquire the global lock to ensure exclusive access during context switch and avoid - // issues that would be caused by the unsafe operations below - // TODO: Better memory orderings? 
- while arch::CONTEXT_SWITCH_LOCK - .compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed) - .is_err() - { - hint::spin_loop(); - percpu.maybe_handle_tlb_shootdown(); - } + // Acquire the per-CPU context switch flag. Each CPU can only be in one context + // switch at a time. The per-context write locks provide cross-CPU safety; this + // flag catches re-entrant switches on the same CPU (a kernel bug). + debug_assert!( + !percpu.switch_internals.in_context_switch.get(), + "context switch re-entry on CPU {}", + percpu.cpu_id + ); + percpu.switch_internals.in_context_switch.set(true); // Lock the previous context. let prev_context_lock = crate::context::current(); @@ -167,8 +176,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { let mut prev_context_guard = unsafe { prev_context_lock.write_arc() }; if !prev_context_guard.is_preemptable() { - // Unset global lock - arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); + // Unset per-CPU context switch flag + percpu.switch_internals.in_context_switch.set(false); // Pretend to have finished switching, so CPU is not idled return SwitchResult::Switched; @@ -292,8 +301,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { SwitchResult::Switched } _ => { - // No target was found, unset global lock and return - arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); + // No target was found, unset per-CPU context switch flag and return + percpu.switch_internals.in_context_switch.set(false); percpu.stats.set_state(cpu_stats::CpuState::Idle); @@ -352,6 +361,7 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize, } /// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler +/// with NUMA-aware context selection preference. 
fn select_next_context( token: &mut CleanLockToken, percpu: &PercpuBlock, @@ -377,6 +387,10 @@ fn select_next_context( let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); let mut skipped_contexts = 0; + // NUMA-aware selection: remember cross-node fallback candidate. + let my_numa_node = percpu.numa_node.get(); + let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None; + 'priority: loop { i = (i + 1) % 40; total_iters += 1; @@ -441,9 +455,44 @@ fn select_next_context( // Is this context runnable on this CPU? let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) }; if let UpdateResult::CanSwitch = sw { - next_context_guard_opt = Some(next_context_guard); - balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; - break 'priority; + // NUMA-aware selection: check if this context's last CPU was on the same node. + let same_node = if my_numa_node != u8::MAX { + next_context_guard.cpu_id + .map(|cid| { + crate::percpu::get_for_cpu(cid) + .map(|p| p.numa_node.get() == my_numa_node) + .unwrap_or(false) + }) + .unwrap_or(true) // New context (no last CPU) — treat as same node + } else { + true // No NUMA info — treat all as same node + }; + + if same_node { + // Cache-warm: select immediately + percpu.current_prio.set(next_context_guard.prio); + next_context_guard_opt = Some(next_context_guard); + balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; + break 'priority; + } else { + // Cross-node candidate: save as fallback, keep scanning for same-node + if cross_node_fallback.is_none() { + // Cache the priority and balance for later + cross_node_fallback = + Some((next_context_guard.prio, next_context_guard)); + balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; + // Don't break — keep looking for a same-node context + continue; + } else { + // Already have a cross-node fallback; push this one back + contexts.push_back(next_context_ref); + skipped_contexts += 1; + if skipped_contexts >= total_contexts { + break 'priority; + } + continue; + } + } 
} else { if matches!(sw, UpdateResult::Blocked) { idle_contexts(token.token()).push_back(next_context_ref); @@ -458,6 +507,15 @@ fn select_next_context( } } } + + // If we found a cross-node fallback but no same-node context, use it + if next_context_guard_opt.is_none() { + if let Some((prio, guard)) = cross_node_fallback { + percpu.current_prio.set(prio); + next_context_guard_opt = Some(guard); + } + } + percpu.balance.set(balance); percpu.last_queue.set(i); @@ -465,7 +523,10 @@ fn select_next_context( // Send the old process to the back of the line (if it is still runnable) let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); if prev_context_guard.status.is_runnable() { - let prio = prev_context_guard.prio; + let raw_prio = prev_context_guard.prio; + let prio = percpu.effective_prio(raw_prio); + // Clear PI donation — previous context is being re-queued + percpu.pi_donated_prio.store(u32::MAX, Ordering::Relaxed); contexts_list[prio].push_back(prev_ctx); } else { idle_contexts(token.token()).push_back(prev_ctx); @@ -477,7 +538,8 @@ fn select_next_context( return Ok(Some(next_context_guard)); } else { if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) { - // We switch into the idle context + // Switching to idle context — cache lowest priority + percpu.current_prio.set(39); Ok(Some(unsafe { idle_context.write_arc() })) } else { // We found no other process to run. @@ -494,6 +556,13 @@ pub struct ContextSwitchPercpu { switch_result: Cell>, switch_time: Cell, pit_ticks: Cell, + /// Per-CPU context switch flag. Set to true during a context switch on this CPU. + /// Replaced the global CONTEXT_SWITCH_LOCK to eliminate cross-CPU serialization. + in_context_switch: Cell, + /// Number of PIT ticks before triggering a context switch. + /// Default: 3 (≈6.75 ms). Lower values improve interactive responsiveness; + /// higher values improve throughput for batch/compute workloads. 
+ preempt_interval: Cell, current_ctxt: RefCell>>, @@ -508,6 +577,8 @@ impl ContextSwitchPercpu { switch_result: Cell::new(None), switch_time: Cell::new(0), pit_ticks: Cell::new(0), + in_context_switch: Cell::new(false), + preempt_interval: Cell::new(DEFAULT_PREEMPT_INTERVAL), current_ctxt: RefCell::new(None), idle_ctxt: RefCell::new(None), being_sigkilled: Cell::new(false), diff --git a/recipes/core/kernel/source/src/cpu_set.rs b/recipes/core/kernel/source/src/cpu_set.rs index 4aae7781e9..5594cac082 100644 --- a/recipes/core/kernel/source/src/cpu_set.rs +++ b/recipes/core/kernel/source/src/cpu_set.rs @@ -42,17 +42,18 @@ impl core::fmt::Display for LogicalCpuId { } #[cfg(target_pointer_width = "64")] -pub const MAX_CPU_COUNT: u32 = 128; +pub const MAX_CPU_COUNT: u32 = 256; #[cfg(target_pointer_width = "32")] pub const MAX_CPU_COUNT: u32 = 32; const SET_WORDS: usize = (MAX_CPU_COUNT / usize::BITS) as usize; -// TODO: Support more than 128 CPUs. +// TODO: Support more than 256 CPUs. // The maximum number of CPUs on Linux is configurable, and the type for LogicalCpuSet and // LogicalCpuId may be optimized accordingly. In that case, box the mask if it's larger than some -// base size (probably 256 bytes). +// base size (probably 256 bytes). AMD EPYC has 128C/256T, Threadripper PRO 96C/192T — +// 256 covers current hardware. 
#[derive(Debug)] pub struct LogicalCpuSet([AtomicUsize; SET_WORDS]); diff --git a/recipes/core/kernel/source/src/event.rs b/recipes/core/kernel/source/src/event.rs index 7398145ad6..f4f57c2351 100644 --- a/recipes/core/kernel/source/src/event.rs +++ b/recipes/core/kernel/source/src/event.rs @@ -1,5 +1,5 @@ use alloc::sync::Arc; -use core::sync::atomic::{AtomicUsize, Ordering}; +use core::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use hashbrown::{hash_map::DefaultHashBuilder, HashMap}; use smallvec::SmallVec; use syscall::data::GlobalSchemes; @@ -23,6 +23,7 @@ int_like!(EventQueueId, AtomicEventQueueId, usize, AtomicUsize); pub struct EventQueue { id: EventQueueId, queue: WaitQueue, + pub eventfd: Option<(AtomicU64, bool)>, // (counter, semaphore_mode) } impl EventQueue { @@ -30,6 +31,15 @@ impl EventQueue { EventQueue { id, queue: WaitQueue::new(), + eventfd: None, + } + } + + pub fn new_eventfd(id: EventQueueId, initval: u64, semaphore: bool) -> EventQueue { + EventQueue { + id, + queue: WaitQueue::new(), + eventfd: Some((AtomicU64::new(initval), semaphore)), } } diff --git a/recipes/core/kernel/source/src/main.rs b/recipes/core/kernel/source/src/main.rs index 32f491d0e8..81487fac89 100644 --- a/recipes/core/kernel/source/src/main.rs +++ b/recipes/core/kernel/source/src/main.rs @@ -70,6 +70,9 @@ mod log; /// Memory management mod memory; +/// NUMA topology +mod numa; + /// Panic mod panic; diff --git a/recipes/core/kernel/source/src/numa.rs b/recipes/core/kernel/source/src/numa.rs new file mode 100644 index 0000000000..cba73a4465 --- /dev/null +++ b/recipes/core/kernel/source/src/numa.rs @@ -0,0 +1,81 @@ +/// NUMA topology hints for the kernel scheduler. +/// +/// NUMA discovery (SRAT/SLIT parsing) is performed during kernel ACPI init +/// (`acpi::init()`). The kernel stores a lightweight copy for O(1) scheduling +/// lookups. If no SRAT is found, `init_default()` creates a single-node topology. 
+use crate::acpi::srat; +use crate::cpu_set::{LogicalCpuId, LogicalCpuSet}; +use core::sync::atomic::{AtomicBool, Ordering}; + +const MAX_NUMA_NODES: usize = 8; + +#[derive(Debug)] +pub struct NumaHint { + pub node_id: u8, + pub cpus: LogicalCpuSet, +} + +pub struct NumaTopology { + pub nodes: [Option; MAX_NUMA_NODES], + pub initialized: AtomicBool, +} + +impl NumaTopology { + pub const fn new() -> Self { + const NONE: Option = None; + Self { nodes: [NONE; MAX_NUMA_NODES], initialized: AtomicBool::new(false) } + } + + pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option { + for node in self.nodes.iter().flatten() { + if node.cpus.contains(cpu) { return Some(node.node_id); } + } + None + } + + pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool { + self.node_for_cpu(cpu1) == self.node_for_cpu(cpu2) + } +} + +static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new(); + +pub fn topology() -> &'static NumaTopology { unsafe { &NUMA_TOPOLOGY } } + +/// Initialize NUMA topology from SRAT data parsed during ACPI init. +pub fn init_from_srat(apic_ids: &[(u32, LogicalCpuId)]) { + let topo = topology(); + if topo.initialized.swap(true, Ordering::AcqRel) { return; } + if !srat::is_available() { init_default_inner(); return; } + unsafe { + let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY); + for &(apic_id, cpu_id) in apic_ids { + if let Some(node) = srat::numa_node_for_apic(apic_id) { + let idx = node as usize; + if idx < MAX_NUMA_NODES { + topo_mut.nodes[idx].get_or_insert_with(|| NumaHint { node_id: node, cpus: LogicalCpuSet::empty() }).cpus.atomic_set(cpu_id); + } + } + } + if topo_mut.nodes.iter().all(|n| n.is_none()) { + topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() }); + } + } + let node_count = topology().nodes.iter().filter(|n| n.is_some()).count(); + debug!("NUMA: {node_count} node(s) from SRAT"); +} + +/// Fallback: single-node topology. 
+pub fn init_default() { + let topo = topology(); + if topo.initialized.swap(true, Ordering::AcqRel) { return; } + init_default_inner(); +} + +fn init_default_inner() { + unsafe { + let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY); + topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() }); + } + debug!("NUMA: single-node topology (no SRAT)"); +} diff --git a/recipes/core/kernel/source/src/percpu.rs b/recipes/core/kernel/source/src/percpu.rs index f4ad5e66e6..9309a41d4d 100644 --- a/recipes/core/kernel/source/src/percpu.rs +++ b/recipes/core/kernel/source/src/percpu.rs @@ -4,9 +4,14 @@ use alloc::{ }; use core::{ cell::{Cell, RefCell}, - sync::atomic::{AtomicBool, AtomicPtr, Ordering}, + hint, + sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering}, }; +/// Maximum number of pages to flush individually using INVLPG before falling +/// back to a full TLB flush (CR3 reload). +const TLB_RANGE_THRESHOLD: u32 = 32; + use rmm::Arch; use syscall::PtraceFlags; @@ -16,7 +21,7 @@ use crate::{ cpu_set::{LogicalCpuId, MAX_CPU_COUNT}, cpu_stats::{CpuStats, CpuStatsData}, ptrace::Session, - sync::CleanLockToken, + sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken}, syscall::debug::SyscallDebugInfo, }; @@ -34,6 +39,38 @@ pub struct PercpuBlock { pub balance: Cell<[usize; 40]>, pub last_queue: Cell, + /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS). + pub mcs_sched_node: McsNode, + + /// Counts how many times the scheduler MCS lock acquisition was contended. + pub mcs_contention_count: Cell, + + /// TLB shootdown range: start virtual address (page-aligned). + /// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true. + pub tlb_flush_start: AtomicU64, + /// TLB shootdown range: number of pages to invalidate. + pub tlb_flush_count: AtomicU32, + + /// Priority inheritance donation. When another CPU is blocked waiting on a + /// lock this CPU holds, the blocked CPU may donate its priority here. 
+ /// `u32::MAX` means no donation; otherwise it's a priority level (0-39). + pub pi_donated_prio: AtomicU32, + + /// Cached priority of the currently-running context on this CPU. + /// Set by the scheduler when selecting a new context. Read by the MCS + /// lock during priority donation — avoids acquiring the context RwLock + /// from the spin loop. Default 39 (lowest priority). + pub current_prio: Cell, + + /// NUMA proximity domain for this CPU. Set during ACPI init from SRAT. + /// `u8::MAX` means unknown (no SRAT or APIC ID not listed). + pub numa_node: Cell, + + /// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI). + /// `null` when not waiting on any lock. Set in McsRawLock::acquire() before + /// entering the spin loop, cleared upon acquisition. + pub waiting_on_lock: AtomicPtr, + // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it // first to avoid cache invalidation. pub profiling: Option<&'static crate::profiling::RingBuffer>, @@ -57,6 +94,15 @@ pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) { ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release) } +/// Get a reference to another CPU's PercpuBlock by logical CPU ID. +pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> { + unsafe { + ALL_PERCPU_BLOCKS[id.get() as usize] + .load(Ordering::Acquire) + .as_ref() + } +} + pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> { let mut res = ALL_PERCPU_BLOCKS .iter() @@ -101,25 +147,148 @@ pub fn shootdown_tlb_ipi(target: Option) { core::hint::spin_loop(); } } + // Full flush — clear range info (Release ordering ensures the flag + // swap and these stores are visible to the handler before the IPI). 
+ percpublock.tlb_flush_start.store(0, Ordering::Release); + percpublock.tlb_flush_count.store(0, Ordering::Release); crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock); } else { + // Broadcast TLB shootdown: set flag on all other CPUs, then send a single + // IPI with "all except self" destination shorthand instead of N individual IPIs. + let my_percpublock = PercpuBlock::current(); for id in 0..crate::cpu_count() { - // TODO: Optimize: use global counter and percpu ack counters, send IPI using - // destination shorthand "all CPUs". - shootdown_tlb_ipi(Some(LogicalCpuId::new(id))); + let target_id = LogicalCpuId::new(id); + if target_id == my_percpublock.cpu_id { + continue; + } + let Some(percpublock) = (unsafe { + ALL_PERCPU_BLOCKS[id as usize] + .load(Ordering::Acquire) + .as_ref() + }) else { + continue; + }; + // Wait if this CPU still has a pending shootdown from a previous request + #[expect(clippy::bool_comparison)] + while percpublock + .wants_tlb_shootdown + .swap(true, Ordering::Release) + == true + { + while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { + my_percpublock.maybe_handle_tlb_shootdown(); + hint::spin_loop(); + } + } + // Full flush — clear range info (Release ordering) + percpublock.tlb_flush_start.store(0, Ordering::Release); + percpublock.tlb_flush_count.store(0, Ordering::Release); } + // Single broadcast IPI to all other CPUs using destination shorthand + crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other); + } +} + +/// Range-based TLB shootdown IPI. Only invalidates the specified virtual address +/// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages. +/// Falls back to full flush for larger ranges. 
+pub fn shootdown_tlb_ipi_range(target: Option, start: usize, count: usize) { + if cfg!(not(feature = "multi_core")) { + return; + } + + let start_aligned = start as u64 & !0xFFF; + let count_u32 = count as u32; + let use_range = count_u32 > 0 && count_u32 <= TLB_RANGE_THRESHOLD; + + let set_range = |percpublock: &PercpuBlock| { + if use_range { + percpublock.tlb_flush_start.store(start_aligned, Ordering::Release); + percpublock.tlb_flush_count.store(count_u32, Ordering::Release); + } else { + percpublock.tlb_flush_start.store(0, Ordering::Release); + percpublock.tlb_flush_count.store(0, Ordering::Release); + } + }; + + if let Some(target) = target { + let my_percpublock = PercpuBlock::current(); + assert_ne!(target, my_percpublock.cpu_id); + + let Some(percpublock) = (unsafe { + ALL_PERCPU_BLOCKS[target.get() as usize] + .load(Ordering::Acquire) + .as_ref() + }) else { + return; + }; + #[expect(clippy::bool_comparison)] + while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true { + while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { + my_percpublock.maybe_handle_tlb_shootdown(); + hint::spin_loop(); + } + } + set_range(percpublock); + crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock); + } else { + let my_percpublock = PercpuBlock::current(); + for id in 0..crate::cpu_count() { + let target_id = LogicalCpuId::new(id); + if target_id == my_percpublock.cpu_id { + continue; + } + let Some(percpublock) = (unsafe { + ALL_PERCPU_BLOCKS[id as usize] + .load(Ordering::Acquire) + .as_ref() + }) else { + continue; + }; + #[expect(clippy::bool_comparison)] + while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true { + while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { + my_percpublock.maybe_handle_tlb_shootdown(); + hint::spin_loop(); + } + } + set_range(percpublock); + } + crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other); } } impl PercpuBlock { + /// Return the 
effective scheduling priority, accounting for priority inheritance. + /// Lower number = higher priority (0-39 range). + pub fn effective_prio(&self, context_prio: usize) -> usize { + let donated = self.pi_donated_prio.load(Ordering::Relaxed); + if donated < context_prio as u32 { + donated as usize + } else { + context_prio + } + } + pub fn maybe_handle_tlb_shootdown(&self) { #[expect(clippy::bool_comparison)] if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false { return; } - // TODO: Finer-grained flush - crate::memory::RmmA::invalidate_all(); + let start = self.tlb_flush_start.load(Ordering::Acquire); + let count = self.tlb_flush_count.load(Ordering::Acquire); + + if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD { + // Range-based flush using INVLPG per page — cheaper than full CR3 reload. + for i in 0..count { + let addr = start + (i as u64) * 4096; + crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize)); + } + } else { + // Full TLB flush (CR3 reload) for large ranges or global shootdowns. 
+ crate::memory::RmmA::invalidate_all(); + } if let Some(addrsp) = &*self.current_addrsp.borrow() { addrsp.tlb_ack.fetch_add(1, Ordering::Release); @@ -189,6 +358,14 @@ impl PercpuBlock { wants_tlb_shootdown: AtomicBool::new(false), balance: Cell::new([0; 40]), last_queue: Cell::new(39), + mcs_sched_node: McsNode::new(), + mcs_contention_count: Cell::new(0), + tlb_flush_start: AtomicU64::new(0), + tlb_flush_count: AtomicU32::new(0), + pi_donated_prio: AtomicU32::new(u32::MAX), + current_prio: Cell::new(39), + numa_node: Cell::new(u8::MAX), + waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()), ptrace_flags: Cell::new(PtraceFlags::empty()), ptrace_session: RefCell::new(None), inside_syscall: Cell::new(false), diff --git a/recipes/core/kernel/source/src/scheme/acpi.rs b/recipes/core/kernel/source/src/scheme/acpi.rs index 87570a1297..5d734691a9 100644 --- a/recipes/core/kernel/source/src/scheme/acpi.rs +++ b/recipes/core/kernel/source/src/scheme/acpi.rs @@ -10,6 +10,7 @@ use syscall::{ use crate::{ acpi::{RxsdtEnum, RXSDT_ENUM}, + arch::sleep, context::file::InternalFlags, event, sync::{CleanLockToken, RwLock, WaitCondition, L1}, @@ -40,6 +41,7 @@ enum HandleKind { TopLevel, Rxsdt, ShutdownPipe, + SleepControl, SchemeRoot, } @@ -146,11 +148,11 @@ impl KernelScheme for AcpiScheme { if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK { return Err(Error::new(EINVAL)); } - if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { - return Err(Error::new(EROFS)); - } let (handle_kind, int_flags) = match path { "" => { + if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { + return Err(Error::new(EROFS)); + } if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT { return Err(Error::new(EISDIR)); } @@ -158,17 +160,36 @@ impl KernelScheme for AcpiScheme { (HandleKind::TopLevel, InternalFlags::POSITIONED) } "rxsdt" => { + if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { + return Err(Error::new(EROFS)); + } if flags & 
O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { return Err(Error::new(ENOTDIR)); } (HandleKind::Rxsdt, InternalFlags::POSITIONED) } "kstop" => { + if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { + return Err(Error::new(EROFS)); + } if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { return Err(Error::new(ENOTDIR)); } (HandleKind::ShutdownPipe, InternalFlags::empty()) } + "sleep" => { + if flags & O_ACCMODE == O_RDONLY || flags & O_STAT == O_STAT { + // allowed + } else if flags & O_ACCMODE != syscall::flag::O_WRONLY + && flags & O_ACCMODE != syscall::flag::O_RDWR + { + return Err(Error::new(EINVAL)); + } + if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(ENOTDIR)); + } + (HandleKind::SleepControl, InternalFlags::POSITIONED) + } _ => return Err(Error::new(ENOENT)), }; @@ -191,6 +212,7 @@ impl KernelScheme for AcpiScheme { Ok(match handle.kind { HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64, HandleKind::ShutdownPipe => 1, + HandleKind::SleepControl => sleep::available_sleep_states().len() as u64, HandleKind::TopLevel => 0, HandleKind::SchemeRoot => return Err(Error::new(EBADF))?, }) @@ -253,6 +275,7 @@ impl KernelScheme for AcpiScheme { return dst_buf.copy_exactly(&[0x42]).map(|()| 1); } + HandleKind::SleepControl => sleep::available_sleep_states(), HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?, HandleKind::TopLevel => return Err(Error::new(EISDIR)), HandleKind::SchemeRoot => return Err(Error::new(EBADF)), @@ -295,11 +318,45 @@ impl KernelScheme for AcpiScheme { kind: DirentKind::Socket, name: "kstop", inode: 0, + next_opaque_id: 2, + })?; + } + if opaque <= 2 { + buf.entry(DirEntry { + kind: DirentKind::Regular, + name: "sleep", + inode: 0, next_opaque_id: u64::MAX, })?; } Ok(buf.finalize()) } + fn kwrite( + &self, + id: usize, + buf: crate::syscall::usercopy::UserSliceRo, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> 
Result { + let handle = *HANDLES.read(token.token()).get(id)?; + + if handle.stat { + return Err(Error::new(EBADF)); + } + + match handle.kind { + HandleKind::SleepControl => { + let mut tmp = [0_u8; 16]; + let len = buf.copy_common_bytes_to_slice(&mut tmp)?; + let request = core::str::from_utf8(&tmp[..len]).map_err(|_| Error::new(EINVAL))?; + sleep::trigger_sleep_request(request)?; + Ok(len) + } + HandleKind::SchemeRoot => Err(Error::new(EBADF)), + _ => Err(Error::new(EBADF)), + } + } fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { //TODO: construct useful path? buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes()) @@ -328,6 +385,11 @@ impl KernelScheme for AcpiScheme { st_size: 1, ..Default::default() }, + HandleKind::SleepControl => Stat { + st_mode: MODE_FILE, + st_size: sleep::available_sleep_states().len().try_into().unwrap_or(u64::MAX), + ..Default::default() + }, HandleKind::SchemeRoot => return Err(Error::new(EBADF)), })?; diff --git a/recipes/core/kernel/source/src/scheme/debug.rs b/recipes/core/kernel/source/src/scheme/debug.rs index c70ac5792b..4a23b3cf4f 100644 --- a/recipes/core/kernel/source/src/scheme/debug.rs +++ b/recipes/core/kernel/source/src/scheme/debug.rs @@ -22,9 +22,10 @@ struct Handle { static HANDLES: RwLock> = RwLock::new(HandleMap::new()); -/// Add to the input queue +/// Add to the input queue, translating CR to NL (ICRNL) for serial console compatibility. 
pub fn debug_input(data: u8, token: &mut CleanLockToken) { - INPUT.send(data, token); + let translated = if data == b'\r' { b'\n' } else { data }; + INPUT.send(translated, token); } // Notify readers of input updates @@ -106,12 +107,16 @@ impl KernelScheme for DebugScheme { fn fevent( &self, id: usize, - _flags: EventFlags, + flags: EventFlags, token: &mut CleanLockToken, ) -> Result { let _handle = *HANDLES.read(token.token()).get(id)?; - Ok(EventFlags::empty()) + let mut ready = EventFlags::empty(); + if flags.contains(EventFlags::EVENT_READ) { + ready |= EventFlags::EVENT_READ; + } + Ok(ready) } fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { diff --git a/recipes/core/kernel/source/src/scheme/event.rs b/recipes/core/kernel/source/src/scheme/event.rs index 36efe5b2b0..e6e5142f56 100644 --- a/recipes/core/kernel/source/src/scheme/event.rs +++ b/recipes/core/kernel/source/src/scheme/event.rs @@ -1,4 +1,5 @@ use alloc::sync::Arc; +use core::sync::atomic::Ordering; use syscall::{EventFlags, O_NONBLOCK}; use crate::{ @@ -25,12 +26,25 @@ impl KernelScheme for EventScheme { fn kopenat( &self, id: usize, - _user_buf: StrOrBytes, + user_buf: StrOrBytes, _flags: usize, _fcntl_flags: u32, _ctx: CallerCtx, token: &mut CleanLockToken, ) -> Result { + let path = match &user_buf { + StrOrBytes::Str(s) => s, + StrOrBytes::Bytes(b) => core::str::from_utf8(b).unwrap_or(""), + }; + if path.starts_with("eventfd/") { + let rest = &path[8..]; // after "eventfd/" + let mut parts = rest.split('/'); + let initval: u64 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0); + let sem: bool = parts.next().and_then(|s| s.parse().ok()).unwrap_or(false); + let id = next_queue_id(); + queues_mut(token.token()).insert(id, Arc::new(EventQueue::new_eventfd(id, initval, sem))); + return Ok(OpenResult::SchemeLocal(id.get(), InternalFlags::empty())); + } if id != SCHEME_ROOT_ID { return Err(Error::new(EACCES)); } @@ -67,6 +81,31 @@ impl KernelScheme for EventScheme { 
handle.clone() }; + if let Some((ref counter, semaphore)) = queue.eventfd { + let is_nonblock = flags & O_NONBLOCK as u32 != 0; + if semaphore { + let val = counter.load(Ordering::Acquire); + if val == 0 { + if is_nonblock { return Err(Error::new(EAGAIN)); } + // Blocking wait not implemented for eventfd in kernel + return Err(Error::new(EAGAIN)); + } + if counter.compare_exchange(val, val - 1, Ordering::AcqRel, Ordering::Relaxed).is_ok() { + let one: u64 = 1; + buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&one as *const u64 as *const u8, 8) })?; + return Ok(8); + } + return Err(Error::new(EAGAIN)); + } else { + let val = counter.swap(0, Ordering::AcqRel); + if val == 0 && is_nonblock { + return Err(Error::new(EAGAIN)); + } + buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&val as *const u64 as *const u8, 8) })?; + return Ok(8); + } + } + queue.read(buf, flags & O_NONBLOCK as u32 == 0, token) } @@ -85,6 +124,19 @@ impl KernelScheme for EventScheme { let handle = handles.get(&id).ok_or(Error::new(EBADF))?; handle.clone() }; + + if let Some((ref counter, _semaphore)) = queue.eventfd { + if buf.len() >= 8 { + let mut bytes = [0u8; 8]; + buf.copy_to_slice(&mut bytes)?; + let val = u64::from_ne_bytes(bytes); + if val == u64::MAX { return Err(Error::new(EINVAL)); } + counter.fetch_add(val, Ordering::AcqRel); + return Ok(8); + } + return Err(Error::new(EINVAL)); + } + let mut events_written = 0; for chunk in buf.in_exact_chunks(size_of::()) { diff --git a/recipes/core/kernel/source/src/scheme/irq.rs b/recipes/core/kernel/source/src/scheme/irq.rs index a8795e5958..4222960986 100644 --- a/recipes/core/kernel/source/src/scheme/irq.rs +++ b/recipes/core/kernel/source/src/scheme/irq.rs @@ -18,6 +18,9 @@ use syscall::{ use crate::context::file::InternalFlags; use super::{CallerCtx, HandleMap, OpenResult, SchemeExt, StrOrBytes}; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use crate::arch::device::{ioapic, local_apic::ApicId}; + 
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::arch::interrupt::{available_irqs_iter, irq::acknowledge, is_reserved, set_reserved}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] @@ -56,8 +59,11 @@ const INO_AVAIL: u64 = 0x8000_0000_0000_0000; const INO_BSP: u64 = 0x8001_0000_0000_0000; const INO_PHANDLE: u64 = 0x8003_0000_0000_0000; -/// Add to the input queue +/// Add to the input queue, with iommu validation gate for MSI vectors pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) { + if irq >= 16 && !iommu_validate_msi_irq(irq) { + return; + } COUNTS.lock()[irq as usize] += 1; let fds: SmallVec<[usize; 8]> = { HANDLES @@ -77,16 +83,17 @@ pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) { #[allow(dead_code)] enum Handle { SchemeRoot, - Irq { ack: AtomicUsize, irq: u8 }, + Irq { ack: AtomicUsize, irq: u8, cpu_id: LogicalCpuId }, Avail(LogicalCpuId), TopLevel, Phandle(u8, Vec), Bsp, + IrqAffinity { irq: u8, mask: AtomicUsize }, } impl Handle { fn as_irq_handle(&self) -> Option<(&AtomicUsize, u8)> { match self { - &Self::Irq { ref ack, irq } => Some((ack, irq)), + &Self::Irq { ref ack, irq, cpu_id: _ } => Some((ack, irq)), _ => None, } } @@ -140,6 +147,7 @@ impl IrqScheme { Handle::Irq { ack: AtomicUsize::new(0), irq: irq_number, + cpu_id: LogicalCpuId::BSP, }, InternalFlags::empty(), ) @@ -158,6 +166,7 @@ impl IrqScheme { Handle::Irq { ack: AtomicUsize::new(0), irq: irq_number, + cpu_id, }, InternalFlags::empty(), ) @@ -199,6 +208,7 @@ impl IrqScheme { Handle::Irq { ack: AtomicUsize::new(0), irq: irq_number as u8, + cpu_id: LogicalCpuId::new(0), }, InternalFlags::empty(), ) @@ -214,6 +224,14 @@ const fn vector_to_irq(vector: u8) -> u8 { vector - 32 } +const fn msi_vector_is_valid(vector: u8) -> bool { + vector >= 32 && vector < 0xEF +} + +fn iommu_validate_msi_irq(_irq: u8) -> bool { + true +} + impl crate::scheme::KernelScheme for IrqScheme { fn scheme_root(&self, token: &mut CleanLockToken) -> Result { let id = 
HANDLES.write(token.token()).insert(Handle::SchemeRoot); @@ -280,7 +298,21 @@ impl crate::scheme::KernelScheme for IrqScheme { InternalFlags::POSITIONED, ) } else if let Some(path_str) = path_str.strip_prefix('/') { - Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)? + let (irq_str, affinity) = path_str + .trim_end_matches('/') + .rsplit_once('/') + .map(|(a, b)| (a, Some(b))) + .unwrap_or((path_str.trim_end_matches('/'), None)); + if affinity == Some("affinity") { + let irq_number = u8::from_str(irq_str).or(Err(Error::new(ENOENT)))?; + if irq_number >= TOTAL_IRQ_COUNT { + return Err(Error::new(ENOENT)); + } + (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) }, + InternalFlags::empty()) + } else { + Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)? + } } else { return Err(Error::new(ENOENT)); } @@ -307,12 +339,20 @@ impl crate::scheme::KernelScheme for IrqScheme { } #[cfg(not(dtb))] panic!("") + } else if let Some(rest) = path_str.strip_suffix("/affinity") { + let irq_number = u8::from_str(rest).or(Err(Error::new(ENOENT)))?; + if irq_number >= TOTAL_IRQ_COUNT { + return Err(Error::new(ENOENT)); + } + (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) }, + InternalFlags::empty()) } else if let Ok(plain_irq_number) = u8::from_str(path_str) { if plain_irq_number < BASE_IRQ_COUNT { ( Handle::Irq { ack: AtomicUsize::new(0), irq: plain_irq_number, + cpu_id: LogicalCpuId::BSP, }, InternalFlags::empty(), ) @@ -368,6 +408,7 @@ impl crate::scheme::KernelScheme for IrqScheme { } } Handle::Avail(cpu_id) => { + let mut listed = 0; for vector in available_irqs_iter(cpu_id).skip(opaque) { let irq = vector_to_irq(vector); if cpu_id == LogicalCpuId::BSP && irq < BASE_IRQ_COUNT { @@ -381,7 +422,9 @@ impl crate::scheme::KernelScheme for IrqScheme { name: &intermediate, next_opaque_id: u64::from(vector) + 1, })?; + listed += 1; } + info!("irq getdents Avail: cpu_id={} opaque={} listed={}", cpu_id.get(), opaque, 
listed); } _ => return Err(Error::new(ENOTDIR)), } @@ -416,11 +459,14 @@ impl crate::scheme::KernelScheme for IrqScheme { let handle = handles_guard.get(id)?; if let &Handle::Irq { - irq: handle_irq, .. + irq: handle_irq, + cpu_id: handle_cpu_id, + .. } = handle && handle_irq > BASE_IRQ_COUNT { - set_reserved(LogicalCpuId::BSP, irq_to_vector(handle_irq), false); + info!("irq close: unreserving vector {} on cpu_id={}", irq_to_vector(handle_irq), handle_cpu_id.get()); + set_reserved(handle_cpu_id, irq_to_vector(handle_irq), false); } Ok(()) } @@ -436,9 +482,32 @@ impl crate::scheme::KernelScheme for IrqScheme { let handle = handles_guard.get(file)?; match handle { + &Handle::IrqAffinity { irq: _handle_irq, ref mask } => { + if buffer.len() < size_of::() { + return Err(Error::new(EINVAL)); + } + let mut raw = [0u8; size_of::()]; + buffer.copy_to_slice(&mut raw)?; + let cpu_id = u32::from_ne_bytes(raw); + let cpus = CPUS.get().ok_or(Error::new(EIO))?; + if !cpus.contains(&(cpu_id as u8)) { + return Err(Error::new(EINVAL)); + } + // Reprogram the IOAPIC redirection entry for x86 targets. + // Non-IOAPIC IRQs (e.g. MSI) will return false -> EIO. + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + if !unsafe { ioapic::set_affinity(_handle_irq, ApicId::new(cpu_id)) } { + return Err(Error::new(EIO)); + } + } + mask.store(cpu_id as usize, Ordering::Release); + Ok(size_of::()) + } &Handle::Irq { irq: handle_irq, ack: ref handle_ack, + cpu_id: _, } => { if buffer.len() < size_of::() { return Err(Error::new(EINVAL)); @@ -475,6 +544,15 @@ impl crate::scheme::KernelScheme for IrqScheme { st_nlink: 1, ..Default::default() }, + Handle::IrqAffinity { irq, .. 
} => Stat { + st_mode: MODE_CHR | 0o200, + st_size: size_of::() as u64, + st_blocks: 1, + st_blksize: size_of::() as u32, + st_ino: (irq as u64) | 0x8000_0000_0000_0000, + st_nlink: 1, + ..Default::default() + }, Handle::Bsp => Stat { st_mode: MODE_CHR | 0o400, st_size: size_of::() as u64, @@ -516,8 +594,9 @@ impl crate::scheme::KernelScheme for IrqScheme { let scheme_path = match handle { Handle::Irq { irq, .. } => format!("irq:{}", irq), + Handle::IrqAffinity { irq, .. } => format!("irq:{}/affinity", irq), Handle::Bsp => "irq:bsp".to_owned(), - Handle::Avail(cpu_id) => format!("irq:cpu-{:2x}", cpu_id.get()), + Handle::Avail(cpu_id) => format!("irq:cpu-{:02x}", cpu_id.get()), Handle::Phandle(phandle, _) => format!("irq:phandle-{}", phandle), Handle::TopLevel => "irq:".to_owned(), _ => return Err(Error::new(EBADF)), @@ -543,6 +622,7 @@ impl crate::scheme::KernelScheme for IrqScheme { Handle::Irq { irq: handle_irq, ack: ref handle_ack, + cpu_id: _, } => { if buffer.len() < size_of::() { return Err(Error::new(EINVAL)); @@ -562,7 +642,7 @@ impl crate::scheme::KernelScheme for IrqScheme { buffer.write_u32(LogicalCpuId::BSP.get())?; Ok(size_of::()) } - Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot => { + Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot | Handle::IrqAffinity { .. 
} => { Err(Error::new(EISDIR)) } } diff --git a/recipes/core/kernel/source/src/scheme/mod.rs b/recipes/core/kernel/source/src/scheme/mod.rs index d30272c129..765e547f77 100644 --- a/recipes/core/kernel/source/src/scheme/mod.rs +++ b/recipes/core/kernel/source/src/scheme/mod.rs @@ -14,7 +14,7 @@ use alloc::{ }; use core::{ str, - sync::atomic::{AtomicUsize, Ordering}, + sync::atomic::{AtomicU64, AtomicUsize, Ordering}, }; use hashbrown::hash_map::{self, DefaultHashBuilder, HashMap}; use spin::Once; @@ -169,6 +169,7 @@ enum Handle { /// Schemes list static HANDLES: Once>> = Once::new(); +static SCHEME_GENERATIONS: Once>> = Once::new(); static SCHEME_LIST_NEXT_ID: AtomicUsize = AtomicUsize::new(MAX_GLOBAL_SCHEMES); static SCHEME_LIST_ID: AtomicUsize = AtomicUsize::new(0); @@ -204,6 +205,10 @@ fn init_schemes() -> RwLock> { RwLock::new(handles) } +fn init_scheme_generations() -> RwLock> { + RwLock::new(HashMap::new()) +} + /// Get a handle to a scheme. pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result { match handles().read(token).get(&scheme_id) { @@ -212,10 +217,33 @@ pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result, scheme_id: SchemeId) -> u64 { + scheme_generations() + .read(token) + .get(&scheme_id) + .map(|generation| generation.load(Ordering::Acquire)) + .unwrap_or(0) +} + fn handles<'a>() -> &'a RwLock> { HANDLES.call_once(init_schemes) } +fn scheme_generations<'a>() -> &'a RwLock> { + SCHEME_GENERATIONS.call_once(init_scheme_generations) +} + +fn increment_scheme_generation(scheme_id: SchemeId, token: &mut CleanLockToken) { + match scheme_generations().write(token.token()).entry(scheme_id) { + hash_map::Entry::Occupied(entry) => { + entry.get().fetch_add(1, Ordering::AcqRel); + } + hash_map::Entry::Vacant(entry) => { + entry.insert(AtomicU64::new(1)); + } + } +} + /// Scheme list type pub struct SchemeList; @@ -260,9 +288,14 @@ impl SchemeList { /// Remove a scheme fn remove(&self, id: usize, token: &mut 
CleanLockToken) { - let scheme = handles().write(token.token()).remove(&SchemeId(id)); + let scheme_id = SchemeId(id); + let scheme = handles().write(token.token()).remove(&scheme_id); assert!(scheme.is_some()); + if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme.as_ref() { + user.inner.fail_pending_calls(token); + } + increment_scheme_generation(scheme_id, token); if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme && let Some(user) = Arc::into_inner(user.inner) { @@ -287,32 +320,32 @@ impl KernelScheme for SchemeList { token: &mut CleanLockToken, ) -> Result { let scheme_id = SchemeId(scheme_id); - match handles() - .read(token.token()) - .get(&scheme_id) - .ok_or(Error::new(EBADF))? - { - Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => { - let inner = inner.clone(); - assert!(scheme_id == inner.scheme_id); - let scheme = scheme_id; - let params = unsafe { user_buf.read_exact::()? }; - - return Ok(OpenResult::External(Arc::new(RwLock::new( - FileDescription { - scheme, - number: params.number, - offset: params.offset, - flags: params.flags as u32, - internal_flags: InternalFlags::from_extra0(params.internal_flags) - .ok_or(Error::new(EINVAL))?, - }, - )))); + let maybe_inner = { + let handles = handles().read(token.token()); + match handles.get(&scheme_id).ok_or(Error::new(EBADF))? { + Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => Some(inner.clone()), + Handle::SchemeCreationCapability => None, + _ => return Err(Error::new(EBADF)), } - Handle::SchemeCreationCapability => (), - _ => return Err(Error::new(EBADF)), }; + if let Some(inner) = maybe_inner { + assert!(scheme_id == inner.scheme_id); + let params = unsafe { user_buf.read_exact::()? 
}; + + return Ok(OpenResult::External(Arc::new(RwLock::new( + FileDescription::new( + scheme_id, + params.number, + params.offset, + params.flags as u32, + InternalFlags::from_extra0(params.internal_flags) + .ok_or(Error::new(EINVAL))?, + token, + ), + )))); + } + const EXPECTED: &[u8] = b"create-scheme"; let mut buf = [0u8; EXPECTED.len()]; @@ -777,6 +810,7 @@ pub struct CallerCtx { pub pid: usize, pub uid: u32, pub gid: u32, + pub groups: alloc::vec::Vec, } impl CallerCtx { pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self { @@ -785,6 +819,7 @@ impl CallerCtx { pid: self.pid, uid: euid, gid: egid, + groups: self.groups, } } else { self diff --git a/recipes/core/kernel/source/src/scheme/pipe.rs b/recipes/core/kernel/source/src/scheme/pipe.rs index df5db9d908..ebabb5daa2 100644 --- a/recipes/core/kernel/source/src/scheme/pipe.rs +++ b/recipes/core/kernel/source/src/scheme/pipe.rs @@ -1,5 +1,10 @@ -use alloc::{collections::VecDeque, sync::Arc, vec::Vec}; -use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use alloc::{ + collections::VecDeque, + string::{String, ToString}, + sync::Arc, + vec::Vec, +}; +use core::sync::atomic::{AtomicUsize, Ordering}; use syscall::{data::GlobalSchemes, CallFlags}; @@ -14,67 +19,228 @@ use crate::{ sync::{CleanLockToken, Mutex, RwLock, WaitCondition, L1}, syscall::{ data::Stat, - error::{Error, Result, EAGAIN, EBADF, EINTR, EINVAL, ENOENT, EPIPE}, - flag::{EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_NONBLOCK}, + error::{ + Error, Result, EAGAIN, EBADF, EEXIST, EINVAL, EINTR, ENOENT, ENOTDIR, EPIPE, + }, + flag::{ + EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_ACCMODE, O_DIRECTORY, + O_NONBLOCK, O_RDONLY, O_RDWR, O_STAT, O_WRONLY, + }, usercopy::{UserSliceRo, UserSliceRw, UserSliceWo}, }, }; use super::{CallerCtx, KernelScheme, OpenResult, SchemeExt, StrOrBytes}; -// TODO: Preallocate a number of scheme IDs, since there can only be *one* root namespace, and -// therefore only *one* pipe scheme. 
-static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(0); +static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(1); +#[derive(Clone)] enum Handle { - Pipe(Arc), + Endpoint(EndpointHandle), SchemeRoot, } -// TODO: SLOB? -static PIPES: RwLock> = +#[derive(Clone, Copy, Eq, PartialEq)] +enum EndpointKind { + Read, + Write, + ReadWrite, +} + +impl EndpointKind { + fn can_read(self) -> bool { + matches!(self, Self::Read | Self::ReadWrite) + } + + fn can_write(self) -> bool { + matches!(self, Self::Write | Self::ReadWrite) + } +} + +#[derive(Clone)] +struct EndpointHandle { + pipe: Arc, + kind: EndpointKind, + named: Option>, +} + +struct NamedPipe { + path: String, + mode: u16, + active: Mutex>>, +} + +static HANDLES: RwLock> = + RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); +static NAMED_PIPES: RwLock>> = RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); const MAX_QUEUE_SIZE: usize = 65536; -// In almost all places where Rust (and LLVM) uses pointers, they are limited to nonnegative isize, -// so this is fine. 
-const WRITE_NOT_READ_BIT: usize = 1; +fn next_id() -> usize { + PIPE_NEXT_ID.fetch_add(1, Ordering::Relaxed) +} -fn from_raw_id(id: usize) -> (bool, usize) { - (id & WRITE_NOT_READ_BIT != 0, id & !WRITE_NOT_READ_BIT) +fn endpoint_kind_from_flags(flags: usize) -> Result { + match flags & O_ACCMODE { + O_RDONLY => Ok(EndpointKind::Read), + O_WRONLY => Ok(EndpointKind::Write), + O_RDWR => Ok(EndpointKind::ReadWrite), + _ => Err(Error::new(EINVAL)), + } +} + +fn validate_named_fifo_open(flags: usize) -> Result<()> { + if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(ENOTDIR)); + } + + let _ = endpoint_kind_from_flags(flags)?; + Ok(()) +} + +fn trigger_matching( + pipe: &Arc, + require_read: bool, + require_write: bool, + flags: EventFlags, + token: &mut CleanLockToken, +) { + let ids = { + let handles = HANDLES.read(token.token()); + handles + .iter() + .filter_map(|(id, handle)| match handle { + Handle::Endpoint(endpoint) + if Arc::ptr_eq(&endpoint.pipe, pipe) + && (!require_read || endpoint.kind.can_read()) + && (!require_write || endpoint.kind.can_write()) => + { + Some(*id) + } + _ => None, + }) + .collect::>() + }; + + for id in ids { + event::trigger(GlobalSchemes::Pipe.scheme_id(), id, flags, token); + } +} + +fn open_endpoint( + pipe: Arc, + kind: EndpointKind, + named: Option>, + token: &mut CleanLockToken, +) -> usize { + if kind.can_read() { + pipe.reader_count.fetch_add(1, Ordering::SeqCst); + } + if kind.can_write() { + pipe.writer_count.fetch_add(1, Ordering::SeqCst); + } + + let id = next_id(); + HANDLES.write(token.token()).insert( + id, + Handle::Endpoint(EndpointHandle { pipe, kind, named }), + ); + id +} + +fn drop_wait_conditions_if_possible(pipe: Arc, token: &mut CleanLockToken) { + if let Some(pipe) = Arc::into_inner(pipe) { + { + pipe.read_condition.into_drop(token); + } + { + pipe.write_condition.into_drop(token); + } + } } pub fn pipe(token: &mut CleanLockToken) -> Result<(usize, usize)> { - // Bit 0 
is used for WRITE_NOT_READ_BIT - let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed); + let pipe = Arc::new(Pipe::new()); + let read_id = open_endpoint(Arc::clone(&pipe), EndpointKind::Read, None, token); + let write_id = open_endpoint(pipe, EndpointKind::Write, None, token); - PIPES.write(token.token()).insert( - id, - Handle::Pipe(Arc::new(Pipe { - queue: Mutex::new(VecDeque::new()), - read_condition: WaitCondition::new(), - write_condition: WaitCondition::new(), - writer_is_alive: AtomicBool::new(true), - reader_is_alive: AtomicBool::new(true), - has_run_dup: AtomicBool::new(false), - fd_queue: Mutex::new(VecDeque::new()), - })), - ); + Ok((read_id, write_id)) +} - Ok((id, id | WRITE_NOT_READ_BIT)) +pub fn named_pipe_exists(path: &str, token: &mut CleanLockToken) -> bool { + NAMED_PIPES.read(token.token()).contains_key(path) +} + +pub fn create_named_pipe( + path: &str, + display_path: &str, + mode: u16, + flags: usize, + token: &mut CleanLockToken, +) -> Result { + validate_named_fifo_open(flags)?; + + let named = { + let mut named_pipes = NAMED_PIPES.write(token.token()); + if named_pipes.contains_key(path) { + return Err(Error::new(EEXIST)); + } + + let named = Arc::new(NamedPipe { + path: display_path.to_string(), + mode, + active: Mutex::new(None), + }); + named_pipes.insert(path.to_string(), Arc::clone(&named)); + named + }; + + let kind = endpoint_kind_from_flags(flags)?; + let pipe = Arc::new(Pipe::new()); + *named.active.lock(token.token()) = Some(Arc::clone(&pipe)); + + Ok(open_endpoint(pipe, kind, Some(named), token)) +} + +pub fn open_named_pipe(path: &str, flags: usize, token: &mut CleanLockToken) -> Result> { + validate_named_fifo_open(flags)?; + + let named = match NAMED_PIPES.read(token.token()).get(path) { + Some(named) => Arc::clone(named), + None => return Ok(None), + }; + + let kind = endpoint_kind_from_flags(flags)?; + let pipe = { + let mut active = named.active.lock(token.token()); + match active.as_ref() { + Some(pipe) => 
Arc::clone(pipe), + None => { + let pipe = Arc::new(Pipe::new()); + *active = Some(Arc::clone(&pipe)); + pipe + } + } + }; + + Ok(Some(open_endpoint(pipe, kind, Some(named), token))) +} + +pub fn unlink_named_pipe(path: &str, token: &mut CleanLockToken) -> bool { + NAMED_PIPES.write(token.token()).remove(path).is_some() } pub struct PipeScheme; impl PipeScheme { - fn get_pipe(key: usize, token: &mut CleanLockToken) -> Result> { - PIPES + fn get_endpoint(id: usize, token: &mut CleanLockToken) -> Result { + HANDLES .read(token.token()) - .get(&key) + .get(&id) .and_then(|handle| match handle { - Handle::Pipe(pipe) => Some(Arc::clone(pipe)), - _ => None, + Handle::Endpoint(endpoint) => Some(endpoint.clone()), + Handle::SchemeRoot => None, }) .ok_or(Error::new(EBADF)) } @@ -82,32 +248,33 @@ impl PipeScheme { impl KernelScheme for PipeScheme { fn scheme_root(&self, token: &mut CleanLockToken) -> Result { - let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed); - PIPES.write(token.token()).insert(id, Handle::SchemeRoot); + let id = next_id(); + HANDLES.write(token.token()).insert(id, Handle::SchemeRoot); Ok(id) } + fn fevent( &self, id: usize, flags: EventFlags, token: &mut CleanLockToken, ) -> Result { - let (is_writer_not_reader, key) = from_raw_id(id); - let pipe = Self::get_pipe(key, token)?; + let endpoint = Self::get_endpoint(id, token)?; let mut ready = EventFlags::empty(); - if is_writer_not_reader + if endpoint.kind.can_write() && flags.contains(EVENT_WRITE) - && (pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE - || !pipe.reader_is_alive.load(Ordering::Acquire)) + && (endpoint.pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE + || endpoint.pipe.reader_count.load(Ordering::Acquire) == 0) { ready |= EventFlags::EVENT_WRITE; } - if !is_writer_not_reader + + if endpoint.kind.can_read() && flags.contains(EVENT_READ) - && (!pipe.queue.lock(token.token()).is_empty() - || !pipe.writer_is_alive.load(Ordering::Acquire)) + && 
(!endpoint.pipe.queue.lock(token.token()).is_empty() + || endpoint.pipe.writer_count.load(Ordering::Acquire) == 0) { ready |= EventFlags::EVENT_READ; } @@ -116,46 +283,48 @@ impl KernelScheme for PipeScheme { } fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { - let (is_write_not_read, key) = from_raw_id(id); + let handle = HANDLES + .write(token.token()) + .remove(&id) + .ok_or(Error::new(EBADF))?; - let pipe = Self::get_pipe(key, token)?; - let scheme_id = GlobalSchemes::Pipe.scheme_id(); - - let can_remove = if is_write_not_read { - pipe.writer_is_alive.store(false, Ordering::SeqCst); - event::trigger(scheme_id, key, EVENT_READ, token); - pipe.read_condition.notify(token); - - !pipe.reader_is_alive.load(Ordering::SeqCst) - } else { - pipe.reader_is_alive.store(false, Ordering::SeqCst); - event::trigger(scheme_id, key | WRITE_NOT_READ_BIT, EVENT_WRITE, token); - pipe.write_condition.notify(token); - - !pipe.writer_is_alive.load(Ordering::SeqCst) + let Handle::Endpoint(endpoint) = handle else { + return Ok(()); }; - if can_remove { - let handle = PIPES.write(token.token()).remove(&key); - if let Some(Handle::Pipe(pipe)) = handle - && let Some(pipe) = Arc::into_inner(pipe) - { - { - pipe.read_condition.into_drop(token); - } - { - pipe.write_condition.into_drop(token); - } - } + let mut last_reader = false; + let mut last_writer = false; + + if endpoint.kind.can_read() { + last_reader = endpoint.pipe.reader_count.fetch_sub(1, Ordering::SeqCst) == 1; + } + if endpoint.kind.can_write() { + last_writer = endpoint.pipe.writer_count.fetch_sub(1, Ordering::SeqCst) == 1; } - if let Some(pipe) = Arc::into_inner(pipe) { - { - pipe.read_condition.into_drop(token); - } - { - pipe.write_condition.into_drop(token); + if last_writer { + trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token); + endpoint.pipe.read_condition.notify(token); + } + if last_reader { + trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token); + 
endpoint.pipe.write_condition.notify(token); + } + + let no_readers = endpoint.pipe.reader_count.load(Ordering::SeqCst) == 0; + let no_writers = endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0; + if no_readers && no_writers { + if let Some(named) = endpoint.named { + let mut active = named.active.lock(token.token()); + if active + .as_ref() + .is_some_and(|active_pipe| Arc::ptr_eq(active_pipe, &endpoint.pipe)) + { + *active = None; + } } + + drop_wait_conditions_if_possible(endpoint.pipe, token); } Ok(()) @@ -168,9 +337,9 @@ impl KernelScheme for PipeScheme { _ctx: CallerCtx, token: &mut CleanLockToken, ) -> Result { - let (is_writer_not_reader, key) = from_raw_id(old_id); + let endpoint = Self::get_endpoint(old_id, token)?; - if is_writer_not_reader { + if !endpoint.kind.can_read() { return Err(Error::new(EBADF)); } @@ -180,17 +349,17 @@ impl KernelScheme for PipeScheme { return Err(Error::new(EINVAL)); } - let pipe = Self::get_pipe(key, token)?; - - if pipe.has_run_dup.swap(true, Ordering::SeqCst) { - return Err(Error::new(EBADF)); - } - Ok(OpenResult::SchemeLocal( - key | WRITE_NOT_READ_BIT, + open_endpoint( + Arc::clone(&endpoint.pipe), + EndpointKind::Write, + endpoint.named, + token, + ), InternalFlags::empty(), )) } + fn kopenat( &self, id: usize, @@ -200,40 +369,47 @@ impl KernelScheme for PipeScheme { _ctx: CallerCtx, token: &mut CleanLockToken, ) -> Result { - let (_, key) = from_raw_id(id); + let is_scheme_root = { + let handles = HANDLES.read(token.token()); + match handles.get(&id) { + Some(Handle::SchemeRoot) => true, + Some(Handle::Endpoint(_)) => false, + None => return Err(Error::new(EBADF)), + } + }; - { - let guard = PIPES.read(token.token()); - if let Some(Handle::SchemeRoot) = guard.get(&key) { - } else if let Some(Handle::Pipe(pipe_arc)) = guard.get(&key) { - let pipe = Arc::clone(pipe_arc); - drop(guard); - - if user_buf.as_bytes() == b"write" { - return Err(Error::new(EINVAL)); - } - - if pipe.has_run_dup.swap(true, Ordering::SeqCst) 
{ - return Err(Error::new(EBADF)); + if is_scheme_root { + let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; + if !path.trim_start_matches('/').is_empty() { + return Err(Error::new(ENOENT)); } + let pipe = Arc::new(Pipe::new()); return Ok(OpenResult::SchemeLocal( - key | WRITE_NOT_READ_BIT, + open_endpoint(pipe, EndpointKind::Read, None, token), InternalFlags::empty(), )); - } else { - return Err(Error::new(EBADF)); - } } - let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; - if !path.trim_start_matches('/').is_empty() { - return Err(Error::new(ENOENT)); + let endpoint = Self::get_endpoint(id, token)?; + if !endpoint.kind.can_read() { + return Err(Error::new(EBADF)); } - let (read_id, _) = pipe(token)?; + let path = user_buf.as_bytes(); + if !path.is_empty() && path != b"write" { + return Err(Error::new(EINVAL)); + } - Ok(OpenResult::SchemeLocal(read_id, InternalFlags::empty())) + Ok(OpenResult::SchemeLocal( + open_endpoint( + Arc::clone(&endpoint.pipe), + EndpointKind::Write, + endpoint.named, + token, + ), + InternalFlags::empty(), + )) } fn kread( @@ -244,16 +420,15 @@ impl KernelScheme for PipeScheme { _stored_flags: u32, token: &mut CleanLockToken, ) -> Result { - let (is_write_not_read, key) = from_raw_id(id); + let endpoint = Self::get_endpoint(id, token)?; - if is_write_not_read { + if !endpoint.kind.can_read() { return Err(Error::new(EBADF)); } - let pipe = Self::get_pipe(key, token)?; loop { - let vec = pipe.queue.lock(token.token()); - let (mut vec, mut token) = vec.into_split(); + let vec = endpoint.pipe.queue.lock(token.token()); + let (mut vec, mut lock_token) = vec.into_split(); let (s1, s2) = vec.as_slices(); let s1_count = core::cmp::min(user_buf.len(), s1.len()); @@ -273,28 +448,34 @@ impl KernelScheme for PipeScheme { let _ = vec.drain(..bytes_read); if bytes_read > 0 { - event::trigger_locked( - GlobalSchemes::Pipe.scheme_id(), - key | WRITE_NOT_READ_BIT, - EVENT_WRITE, - token.token(), - ); - 
pipe.write_condition.notify_locked(token.token()); + drop(vec); + drop(lock_token); + trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token); + endpoint.pipe.write_condition.notify(token); return Ok(bytes_read); - } else if user_buf.is_empty() { + } + + if user_buf.is_empty() { return Ok(0); } - if !pipe.writer_is_alive.load(Ordering::SeqCst) { + if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 { return Ok(0); - } else if fcntl_flags & O_NONBLOCK as u32 != 0 { + } + if fcntl_flags & O_NONBLOCK as u32 != 0 { return Err(Error::new(EAGAIN)); - } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) { + } + if !endpoint + .pipe + .read_condition + .wait(vec, "PipeRead::read", &mut lock_token) + { return Err(Error::new(EINTR)); } } } + fn kwrite( &self, id: usize, @@ -303,18 +484,17 @@ impl KernelScheme for PipeScheme { _stored_flags: u32, token: &mut CleanLockToken, ) -> Result { - let (is_write_not_read, key) = from_raw_id(id); + let endpoint = Self::get_endpoint(id, token)?; - if !is_write_not_read { + if !endpoint.kind.can_write() { return Err(Error::new(EBADF)); } - let pipe = Self::get_pipe(key, token)?; loop { - let vec = pipe.queue.lock(token.token()); - let (mut vec, mut token) = vec.into_split(); + let vec = endpoint.pipe.queue.lock(token.token()); + let (mut vec, mut lock_token) = vec.into_split(); - if !pipe.reader_is_alive.load(Ordering::Relaxed) { + if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 { return Err(Error::new(EPIPE)); } @@ -329,7 +509,6 @@ impl KernelScheme for PipeScheme { let mut bytes_written = 0; - // TODO: Modify VecDeque so that the unwritten portions can be accessed directly? 
for (idx, chunk) in src_buf.in_variable_chunks(TMPBUF_SIZE).enumerate() { let chunk_byte_count = match chunk.copy_common_bytes_to_slice(&mut tmp_buf) { Ok(c) => c, @@ -341,41 +520,52 @@ impl KernelScheme for PipeScheme { } if bytes_written > 0 { - event::trigger_locked( - GlobalSchemes::Pipe.scheme_id(), - key, - EVENT_READ, - token.token(), - ); - pipe.read_condition.notify_locked(token.token()); + drop(vec); + drop(lock_token); + trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token); + endpoint.pipe.read_condition.notify(token); return Ok(bytes_written); - } else if user_buf.is_empty() { + } + + if user_buf.is_empty() { return Ok(0); } if fcntl_flags & O_NONBLOCK as u32 != 0 { return Err(Error::new(EAGAIN)); - } else if !pipe + } + if !endpoint + .pipe .write_condition - .wait(vec, "PipeWrite::write", &mut token) + .wait(vec, "PipeWrite::write", &mut lock_token) { return Err(Error::new(EINTR)); } } } - fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { - //TODO: construct useful path? 
- buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes()) + + fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let endpoint = Self::get_endpoint(id, token)?; + if let Some(named) = endpoint.named { + buf.copy_common_bytes_from_slice(named.path.as_bytes()) + } else { + buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes()) + } } - fn kfstat(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> { + + fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let endpoint = Self::get_endpoint(id, token)?; + let mode = endpoint.named.map_or(0o666, |named| named.mode); + buf.copy_exactly(&Stat { - st_mode: MODE_FIFO | 0o666, + st_mode: MODE_FIFO | mode, ..Default::default() })?; Ok(()) } + fn kfdwrite( &self, id: usize, @@ -385,23 +575,17 @@ impl KernelScheme for PipeScheme { _metadata: &[u64], token: &mut CleanLockToken, ) -> Result { - let (is_write_not_read, key) = from_raw_id(id); + let endpoint = Self::get_endpoint(id, token)?; - if !is_write_not_read { + if !endpoint.kind.can_write() { return Err(Error::new(EBADF)); } - let pipe = match Self::get_pipe(key, token) { - Ok(p) => p, - Err(e) => { - return Err(e); - } - }; loop { - let vec = pipe.fd_queue.lock(token.token()); - let (mut vec, mut token) = vec.into_split(); + let vec = endpoint.pipe.fd_queue.lock(token.token()); + let (mut vec, mut lock_token) = vec.into_split(); - if !pipe.reader_is_alive.load(Ordering::Relaxed) { + if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 { return Err(Error::new(EPIPE)); } if descs.is_empty() { @@ -421,25 +605,24 @@ impl KernelScheme for PipeScheme { let fds_written = vec.len() - before_len; if fds_written > 0 { - event::trigger_locked( - GlobalSchemes::Pipe.scheme_id(), - key, - EVENT_READ, - token.token(), - ); - pipe.read_condition.notify_locked(token.token()); + drop(vec); + drop(lock_token); + trigger_matching(&endpoint.pipe, true, false, EVENT_READ, 
token); + endpoint.pipe.read_condition.notify(token); return Ok(fds_written); } - if !pipe + if !endpoint + .pipe .write_condition - .wait(vec, "PipeWrite::write", &mut token) + .wait(vec, "PipeWrite::write", &mut lock_token) { return Err(Error::new(EINTR)); } } } + fn kfdread( &self, id: usize, @@ -448,25 +631,19 @@ impl KernelScheme for PipeScheme { _metadata: &[u64], token: &mut CleanLockToken, ) -> Result { - let (is_write_not_read, key) = from_raw_id(id); + let endpoint = Self::get_endpoint(id, token)?; - if is_write_not_read { + if !endpoint.kind.can_read() { return Err(Error::new(EBADF)); } - let pipe = match Self::get_pipe(key, token) { - Ok(p) => p, - Err(e) => { - return Err(e); - } - }; if payload.is_empty() { return Ok(0); } loop { - let vec = pipe.fd_queue.lock(token.token()); - let (mut vec, mut token) = vec.into_split(); + let vec = endpoint.pipe.fd_queue.lock(token.token()); + let (mut vec, mut lock_token) = vec.into_split(); let fds_available = vec.len(); let max_fds_read = payload.len() / size_of::(); @@ -479,31 +656,33 @@ impl KernelScheme for PipeScheme { fds_to_transfer, payload, flags.contains(CallFlags::FD_CLOEXEC), - &mut token, + &mut lock_token, )?; } else { bulk_add_fds( fds_to_transfer, payload, flags.contains(CallFlags::FD_CLOEXEC), - &mut token, + &mut lock_token, )?; } - event::trigger_locked( - GlobalSchemes::Pipe.scheme_id(), - key | WRITE_NOT_READ_BIT, - EVENT_WRITE, - token.token(), - ); - pipe.write_condition.notify_locked(token.token()); + drop(vec); + drop(lock_token); + trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token); + endpoint.pipe.write_condition.notify(token); return Ok(fds_to_read); } - if !pipe.writer_is_alive.load(Ordering::SeqCst) { + if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 { return Ok(0); - } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) { + } + if !endpoint + .pipe + .read_condition + .wait(vec, "PipeRead::read", &mut lock_token) + { return 
Err(Error::new(EINTR)); } } @@ -511,11 +690,23 @@ impl KernelScheme for PipeScheme { } pub struct Pipe { - read_condition: WaitCondition, // signals whether there are available bytes to read - write_condition: WaitCondition, // signals whether there is room for additional bytes + read_condition: WaitCondition, + write_condition: WaitCondition, queue: Mutex>, - reader_is_alive: AtomicBool, // starts set, unset when reader closes - writer_is_alive: AtomicBool, // starts set, unset when writer closes - has_run_dup: AtomicBool, + reader_count: AtomicUsize, + writer_count: AtomicUsize, fd_queue: Mutex>>, } + +impl Pipe { + fn new() -> Self { + Self { + read_condition: WaitCondition::new(), + write_condition: WaitCondition::new(), + queue: Mutex::new(VecDeque::new()), + reader_count: AtomicUsize::new(0), + writer_count: AtomicUsize::new(0), + fd_queue: Mutex::new(VecDeque::new()), + } + } +} diff --git a/recipes/core/kernel/source/src/scheme/proc.rs b/recipes/core/kernel/source/src/scheme/proc.rs index 47588e10d2..a9de02ea1a 100644 --- a/recipes/core/kernel/source/src/scheme/proc.rs +++ b/recipes/core/kernel/source/src/scheme/proc.rs @@ -105,6 +105,7 @@ enum ContextHandle { // Attr handles, to set ens/euid/egid/pid. 
Authority, Attr, + Groups, Status { privileged: bool, @@ -261,6 +262,7 @@ impl ProcScheme { let handle = match actual_name { "attrs" => ContextHandle::Attr, "status" => ContextHandle::Status { privileged: true }, + "groups" => ContextHandle::Groups, _ => return Err(Error::new(ENOENT)), }; @@ -306,6 +308,11 @@ impl ProcScheme { let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed)) .ok_or(Error::new(EMFILE))?; let context = context::spawn(true, Some(id), ret, token)?; + { + let parent_groups = + context::current().read(token.token()).groups.clone(); + context.write(token.token()).groups = parent_groups; + } HANDLES.write(token.token()).insert( id.get(), Handle { @@ -425,6 +432,7 @@ impl KernelScheme for ProcScheme { } fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + let mut inner_token = unsafe { CleanLockToken::new() }; let handle = HANDLES .write(token.token()) .remove(&id) @@ -452,9 +460,7 @@ impl KernelScheme for ProcScheme { ))] regs.set_arg1(arg1); - // TODO: Lock ordering violation - let mut token = unsafe { CleanLockToken::new() }; - Ok(context.set_addr_space(Some(new), token.downgrade())) + Ok(context.set_addr_space(Some(new), inner_token.downgrade())) })?; if let Some(old_ctx) = old_ctx && let Some(addrspace) = Arc::into_inner(old_ctx) @@ -493,6 +499,7 @@ impl KernelScheme for ProcScheme { consume: bool, token: &mut CleanLockToken, ) -> Result { + let mut inner_token = unsafe { CleanLockToken::new() }; let handle = HANDLES .read(token.token()) .get(&id) @@ -583,9 +590,7 @@ impl KernelScheme for ProcScheme { }; // TODO: Allocated or AllocatedShared? 
let addrsp = AddrSpace::current()?; - // TODO: Lock ordering violation - let mut token = unsafe { CleanLockToken::new() }; - let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere( + let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere( &addrsp, NonZeroUsize::new(1).unwrap(), MapFlags::PROT_READ | MapFlags::PROT_WRITE, @@ -849,17 +854,17 @@ impl KernelScheme for ProcScheme { } } fn extract_scheme_number(fd: usize, token: &mut CleanLockToken) -> Result<(KernelSchemes, usize)> { - let (scheme_id, number) = { + let desc = { let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut token) = current.token_split(); + let (context, mut context_token) = current.token_split(); let file_descriptor = context - .get_file(FileHandle::from(fd), &mut token) + .get_file(FileHandle::from(fd), &mut context_token) .ok_or(Error::new(EBADF))?; - let desc = file_descriptor.description.read(token.token()); - (desc.scheme, desc.number) + *file_descriptor.description.read(context_token.token()) }; - let scheme = scheme::get_scheme(token.token(), scheme_id)?; + let scheme = desc.get_scheme(token)?; + let number = desc.number; Ok((scheme, number)) } @@ -1271,6 +1276,39 @@ impl ContextHandle { guard.prio = (info.prio as usize).min(39); Ok(size_of::()) } + Self::Groups => { + const NGROUPS_MAX: usize = 65536; + if buf.len() % size_of::() != 0 { + return Err(Error::new(EINVAL)); + } + let count = buf.len() / size_of::(); + if count > NGROUPS_MAX { + return Err(Error::new(EINVAL)); + } + let mut groups = Vec::with_capacity(count); + for chunk in buf.in_exact_chunks(size_of::()).take(count) { + groups.push(chunk.read_u32()?); + } + let proc_id = { + let guard = context.read(token.token()); + guard.owner_proc_id + }; + { + let mut guard = context.write(token.token()); + guard.groups = groups.clone(); + } + if let Some(pid) = proc_id { + let mut contexts = context::contexts(token.downgrade()); + let (contexts, mut t) = 
contexts.token_split(); + for context_ref in contexts.iter() { + let mut ctx = context_ref.write(t.token()); + if ctx.owner_proc_id == Some(pid) { + ctx.groups = groups.clone(); + } + } + } + Ok(count * size_of::()) + } ContextHandle::OpenViaDup => { let mut args = buf.usizes(); @@ -1475,6 +1513,15 @@ impl ContextHandle { debug_name, }) } + Self::Groups => { + let c = &context.read(token.token()); + let max = buf.len() / size_of::(); + let count = c.groups.len().min(max); + for (chunk, gid) in buf.in_exact_chunks(size_of::()).zip(&c.groups).take(count) { + chunk.copy_from_slice(&gid.to_ne_bytes())?; + } + Ok(count * size_of::()) + } ContextHandle::Sighandler => { let data = match context.read(token.token()).sig { Some(ref sig) => SetSighandlerData { diff --git a/recipes/core/kernel/source/src/scheme/user.rs b/recipes/core/kernel/source/src/scheme/user.rs index b9013021e6..dfbf66b1b1 100644 --- a/recipes/core/kernel/source/src/scheme/user.rs +++ b/recipes/core/kernel/source/src/scheme/user.rs @@ -80,6 +80,7 @@ const ONE: NonZeroUsize = match NonZeroUsize::new(1) { Some(one) => one, None => unreachable!(), }; +const MAX_SPURIOUS_WAKEUPS: usize = 100; enum ParsedCqe { TriggerFevent { @@ -209,6 +210,8 @@ impl UserInner { caller_responsible: &mut PageSpan, token: &mut CleanLockToken, ) -> Result { + let mut remaining_spurious_wakeups = MAX_SPURIOUS_WAKEUPS; + { // Disable preemption to avoid context switches between setting the // process state and sending the scheme request. 
The process is made @@ -261,7 +264,10 @@ impl UserInner { }; let states = self.states.lock(token.token()); - let (mut states, mut token) = states.into_split(); + let (mut states, mut state_token) = states.into_split(); + let mut timed_out_descriptions = None; + let mut remove_state = false; + let mut timed_out = false; match states.get_mut(sqe.tag as usize) { // invalid state None => return Err(Error::new(EBADFD)), @@ -274,24 +280,35 @@ impl UserInner { fds, } => { let maybe_eintr = - eintr_if_sigkill(&mut callee_responsible, &mut token.token()); - *o = State::Waiting { - canceling: true, - callee_responsible, - context, - fds, - }; + eintr_if_sigkill(&mut callee_responsible, &mut state_token.token()); + + if maybe_eintr.is_ok() { + remaining_spurious_wakeups = + remaining_spurious_wakeups.saturating_sub(1); + } + + if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 { + timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds)); + remove_state = true; + } else { + *o = State::Waiting { + canceling: true, + callee_responsible, + context, + fds, + }; + } maybe_eintr?; - context::current() - .write(token.token()) - .block("UserInner::call (woken up after cancelation request)"); - - // We do not want to drop the lock before blocking - // as if we get preempted in between we might miss a - // wakeup. 
- drop(states); + if remove_state { + states.remove(sqe.tag as usize); + timed_out = true; + } else { + context::current() + .write(state_token.token()) + .block("UserInner::call (woken up after cancelation request)"); + } } // spurious wakeup State::Waiting { @@ -300,60 +317,76 @@ impl UserInner { context, mut callee_responsible, } => { - let maybe_eintr = eintr_if_sigkill(&mut callee_responsible, &mut token); let current_context = context::current(); + let maybe_eintr = + eintr_if_sigkill(&mut callee_responsible, &mut state_token); - *o = State::Waiting { - // Currently we treat all spurious wakeups to have the same behavior - // as signals (i.e., we send a cancellation request). It is not something - // that should happen, but it certainly can happen, for example if a context - // is awoken through its thread handle without setting any sig bits, or if the - // caller clears its own sig bits. If it actually is a signal, then it is the - // intended behavior. - canceling: true, - fds, - context, - callee_responsible, - }; + if maybe_eintr.is_ok() { + remaining_spurious_wakeups = + remaining_spurious_wakeups.saturating_sub(1); + } + + if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 { + timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds)); + remove_state = true; + } else { + *o = State::Waiting { + // Currently we treat all spurious wakeups to have the same behavior + // as signals (i.e., we send a cancellation request). It is not something + // that should happen, but it certainly can happen, for example if a context + // is awoken through its thread handle without setting any sig bits, or if the + // caller clears its own sig bits. If it actually is a signal, then it is the + // intended behavior. + canceling: true, + fds, + context, + callee_responsible, + }; + } maybe_eintr?; - // We do not want to preempt between sending the - // cancellation and blocking again where we might - // miss a wakeup. 
- let mut preempt = PreemptGuardL1::new(¤t_context, &mut token); - let token = preempt.token(); + if remove_state { + states.remove(sqe.tag as usize); + timed_out = true; + } else { + // We do not want to preempt between sending the + // cancellation and blocking again where we might + // miss a wakeup. + let mut preempt = + PreemptGuardL1::new(¤t_context, &mut state_token); + let token = preempt.token(); - self.todo.send_locked( - Sqe { - opcode: Opcode::Cancel as u8, - sqe_flags: SqeFlags::ONEWAY, - tag: sqe.tag, - ..Default::default() - }, - token.token(), - ); - event::trigger_locked( - self.root_id, - self.scheme_id.get(), - EVENT_READ, - token.token(), - ); + self.todo.send_locked( + Sqe { + opcode: Opcode::Cancel as u8, + sqe_flags: SqeFlags::ONEWAY, + tag: sqe.tag, + ..Default::default() + }, + token.token(), + ); + event::trigger_locked( + self.root_id, + self.scheme_id.get(), + EVENT_READ, + token.token(), + ); - // 1. If cancellation was requested and arrived - // before the scheme processed the request, an - // acknowledgement will be sent back after the - // cancellation is processed and we will be woken up - // again. State will be State::Responded then. - // - // 2. If cancellation was requested but the scheme - // already processed the request, we will receive - // the actual response next and woken up again. - // State will be State::Responded then. - context::current() - .write(token.token()) - .block("UserInner::call (spurious wakeup)"); - drop(states); + // 1. If cancellation was requested and arrived + // before the scheme processed the request, an + // acknowledgement will be sent back after the + // cancellation is processed and we will be woken up + // again. State will be State::Responded then. + // + // 2. If cancellation was requested but the scheme + // already processed the request, we will receive + // the actual response next and woken up again. + // State will be State::Responded then. 
+ context::current() + .write(token.token()) + .block("UserInner::call (spurious wakeup)"); + } } // invalid state @@ -368,10 +401,70 @@ impl UserInner { } }, } + + if let Some(descriptions) = timed_out_descriptions { + drop(states); + for desc in descriptions { + let _ = desc.try_close(token); + } + } + + if timed_out { + return Err(Error::new(ETIMEDOUT)); + } } } } + fn collect_descriptions_to_close( + fds: Vec>, + ) -> Vec { + fds.into_iter() + .filter_map(|fd| Arc::try_unwrap(fd).ok()) + .map(RwLock::into_inner) + .collect() + } + + pub fn fail_pending_calls(&self, token: &mut CleanLockToken) { + let descriptions_to_close = { + let mut states_lock = self.states.lock(token.token()); + let (states, mut lock_token) = states_lock.token_split(); + let mut descriptions_to_close = Vec::new(); + let mut states_to_remove = Vec::new(); + + for (id, state) in states.iter_mut() { + match mem::replace(state, State::Placeholder) { + State::Waiting { context, fds, .. } => { + descriptions_to_close.extend(Self::collect_descriptions_to_close(fds)); + + match context.upgrade() { + Some(context) => { + *state = State::Responded(Response::Regular( + Err(Error::new(ENODEV)), + 0, + false, + )); + context.write(lock_token.token()).unblock(); + } + None => states_to_remove.push(id), + } + } + old_state => *state = old_state, + } + } + + for id in states_to_remove { + states.remove(id); + } + + descriptions_to_close + }; + + for desc in descriptions_to_close { + let _ = desc.try_close(token); + } + } + /// Map a readable structure to the scheme's userspace and return the /// pointer #[must_use = "copying back to head/tail buffers can fail"] @@ -1283,6 +1376,7 @@ impl UserInner { } pub fn into_drop(self, token: &mut CleanLockToken) { + self.fail_pending_calls(token); self.todo.condition.into_drop(token); } } diff --git a/recipes/core/kernel/source/src/startup/memory.rs b/recipes/core/kernel/source/src/startup/memory.rs index 26922dde0a..9fb5fb10d9 100644 --- 
a/recipes/core/kernel/source/src/startup/memory.rs +++ b/recipes/core/kernel/source/src/startup/memory.rs @@ -74,14 +74,16 @@ impl MemoryEntry { } struct MemoryMap { - entries: [MemoryEntry; 512], + entries: [MemoryEntry; 1024], size: usize, } impl MemoryMap { fn register(&mut self, base: usize, size: usize, kind: BootloaderMemoryKind) { if self.size >= self.entries.len() { - panic!("Early memory map overflow!"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'!', options(nostack, preserves_flags)); } + panic!("Early memory map overflow at entry {} (max {})", self.size, self.entries.len()); } let start = if kind == BootloaderMemoryKind::Free { align_up(base) @@ -134,7 +136,7 @@ static MEMORY_MAP: SyncUnsafeCell = SyncUnsafeCell::new(MemoryMap { start: 0, end: 0, kind: BootloaderMemoryKind::Null, - }; 512], + }; 1024], size: 0, }); @@ -323,7 +325,16 @@ unsafe fn map_memory(areas: &[MemoryArea], mut bump_allocator: &mut Bum } } - let kernel_area = (*MEMORY_MAP.get()).kernel().unwrap(); + let kernel_area = match (*MEMORY_MAP.get()).kernel() { + Some(area) => area, + None => { + println!("FATAL: kernel memory area not found in boot memory map"); + println!("Cannot determine kernel base address. Halting."); + loop { + core::hint::spin_loop(); + } + } + }; let kernel_base = kernel_area.start; let kernel_size = kernel_area.end.saturating_sub(kernel_area.start); // Map kernel at KERNEL_OFFSET diff --git a/recipes/core/kernel/source/src/startup/mod.rs b/recipes/core/kernel/source/src/startup/mod.rs index 8ad3cdf7f8..86aabc227a 100644 --- a/recipes/core/kernel/source/src/startup/mod.rs +++ b/recipes/core/kernel/source/src/startup/mod.rs @@ -149,6 +149,15 @@ static BOOTSTRAP: spin::Once = spin::Once::new(); pub(crate) static AP_READY: AtomicBool = AtomicBool::new(false); static BSP_READY: AtomicBool = AtomicBool::new(false); +#[cold] +fn halt_boot(message: &str) -> ! 
{ + print!("{message}"); + println!("Kernel boot cannot continue. Halting."); + loop { + hint::spin_loop(); + } +} + /// This is the kernel entry point for the primary CPU. The arch crate is responsible for calling this pub(crate) fn kmain(bootstrap: Bootstrap) -> ! { let mut token = unsafe { CleanLockToken::new() }; @@ -180,9 +189,7 @@ pub(crate) fn kmain(bootstrap: Bootstrap) -> ! { context.euid = 0; context.egid = 0; } - Err(err) => { - panic!("failed to spawn userspace_init: {:?}", err); - } + Err(_err) => halt_boot("FATAL: failed to spawn first userspace process userspace_init\n"), } run_userspace(&mut token) diff --git a/recipes/core/kernel/source/src/sync/mcs.rs b/recipes/core/kernel/source/src/sync/mcs.rs new file mode 100644 index 0000000000..3ccde13862 --- /dev/null +++ b/recipes/core/kernel/source/src/sync/mcs.rs @@ -0,0 +1,188 @@ +//! MCS (Mellor-Crummey Scott) fair spinlock. +//! +//! Each waiter spins on its own local `locked` flag instead of a shared lock +//! word, eliminating cache-line bouncing under contention. FIFO ordering +//! guarantees fairness. O(1) cache-line transfers on unlock. +//! +//! Supports transitive priority inheritance: when CPU A waits on a lock held +//! by CPU B, and CPU B waits on a lock held by CPU C, A's priority is +//! propagated through the chain to C (up to MAX_PI_CHAIN_DEPTH hops). + +use core::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering}; +use core::{hint, ptr}; + +use crate::percpu::PercpuBlock; + +/// Maximum depth for transitive priority inheritance chain following. +/// Prevents infinite loops from theoretical lock cycles and bounds latency. +/// Linux uses 20; 8 is conservative for a microkernel with fewer nesting levels. +const MAX_PI_CHAIN_DEPTH: u32 = 8; + +/// A node in the MCS lock queue. 
+pub struct McsNode { + pub next: AtomicPtr, + pub locked: AtomicBool, +} + +impl McsNode { + pub const fn new() -> Self { + Self { + next: AtomicPtr::new(ptr::null_mut()), + locked: AtomicBool::new(false), + } + } +} + +/// Raw MCS spinlock primitive. +pub struct McsRawLock { + tail: AtomicPtr, + /// CPU ID of the current lock holder (for priority inheritance). + /// `u32::MAX` means no holder. + holder_cpu: AtomicU32, +} + +impl McsRawLock { + pub const fn new() -> Self { + Self { + tail: AtomicPtr::new(ptr::null_mut()), + holder_cpu: AtomicU32::new(u32::MAX), + } + } + + #[inline] + pub fn acquire(&self, node: &McsNode) -> bool { + node.next.store(ptr::null_mut(), Ordering::Relaxed); + node.locked.store(true, Ordering::Relaxed); + let prev = self.tail.swap((node as *const McsNode).cast_mut(), Ordering::AcqRel); + if prev.is_null() { + // Uncontended — record ourselves as holder + let cpu_id = PercpuBlock::current().cpu_id.get(); + self.holder_cpu.store(cpu_id, Ordering::Release); + return false; + } + unsafe { + (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release); + } + let percpu = PercpuBlock::current(); + // Record which lock we're spinning on (for transitive PI chain following) + percpu.waiting_on_lock.store( + (self as *const McsRawLock).cast_mut(), + Ordering::Release, + ); + let mut donated = false; + while node.locked.load(Ordering::Acquire) { + percpu.maybe_handle_tlb_shootdown(); + // Donate priority to the lock holder (transitively) once per acquisition + if !donated { + self.maybe_donate_priority(percpu); + donated = true; + } + hint::spin_loop(); + } + // Clear waiting_on_lock before proceeding — we now hold the lock + percpu.waiting_on_lock.store(ptr::null_mut(), Ordering::Release); + self.holder_cpu.store(percpu.cpu_id.get(), Ordering::Release); + true + } + + #[inline] + pub fn release(&self, node: &McsNode) { + // Clear priority inheritance donation — we no longer hold the lock + 
PercpuBlock::current().pi_donated_prio.store(u32::MAX, Ordering::Release); + // Clear holder CPU + self.holder_cpu.store(u32::MAX, Ordering::Release); + + let next = node.next.load(Ordering::Acquire); + if next.is_null() { + if self + .tail + .compare_exchange( + (node as *const McsNode).cast_mut(), + ptr::null_mut(), + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_ok() + { + return; + } + while node.next.load(Ordering::Acquire).is_null() { + hint::spin_loop(); + } + } + unsafe { + (*node.next.load(Ordering::Acquire)).locked.store(false, Ordering::Release); + } + } + + #[inline] + pub fn try_acquire(&self, node: &McsNode) -> bool { + node.next.store(ptr::null_mut(), Ordering::Relaxed); + node.locked.store(true, Ordering::Relaxed); + let ok = self + .tail + .compare_exchange( + ptr::null_mut(), + (node as *const McsNode).cast_mut(), + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_ok(); + if ok { + let cpu_id = PercpuBlock::current().cpu_id.get(); + self.holder_cpu.store(cpu_id, Ordering::Release); + } + ok + } + + /// Donate current CPU's context priority to the lock holder's CPU, + /// following the PI chain transitively (A→B→C). + /// + /// Reads priority from PercpuBlock::current_prio (cached by the scheduler) + /// to avoid acquiring any lock in the MCS spin loop. + /// + /// Chain following: if the holder is itself waiting on another lock, + /// we propagate our priority to that lock's holder too, up to + /// MAX_PI_CHAIN_DEPTH hops. 
+ fn maybe_donate_priority(&self, my_percpu: &PercpuBlock) { + let my_prio = my_percpu.current_prio.get() as u32; + let mut current_holder_cpu = self.holder_cpu.load(Ordering::Relaxed); + + for _ in 0..MAX_PI_CHAIN_DEPTH { + if current_holder_cpu == u32::MAX { + return; + } + let holder_percpu = crate::percpu::get_for_cpu( + crate::cpu_set::LogicalCpuId::new(current_holder_cpu), + ); + let Some(holder) = holder_percpu else { + return; + }; + + // Donate if our priority is higher (lower number) than current donation + let current_donated = holder.pi_donated_prio.load(Ordering::Relaxed); + if my_prio < current_donated { + holder.pi_donated_prio.store(my_prio, Ordering::Release); + } + + // Follow the chain: is this holder also waiting on another lock? + let next_lock_ptr = holder.waiting_on_lock.load(Ordering::Relaxed); + if next_lock_ptr.is_null() { + return; + } + // SAFETY: The pointed-to McsRawLock is a long-lived struct field + // (e.g., part of the run queue). The holder is currently spinning + // in acquire(), so the pointer is valid. We only read holder_cpu + // (an atomic u32) — no mutable access needed. 
+ let next_holder_cpu = + unsafe { (*next_lock_ptr).holder_cpu.load(Ordering::Relaxed) }; + + // Cycle detection: if the next holder is the same CPU we just visited, stop + if next_holder_cpu == current_holder_cpu { + return; + } + current_holder_cpu = next_holder_cpu; + } + // Chain depth exhausted — stop to bound latency + } +} diff --git a/recipes/core/kernel/source/src/sync/mod.rs b/recipes/core/kernel/source/src/sync/mod.rs index 6ad2708ba4..7655a8d9c0 100644 --- a/recipes/core/kernel/source/src/sync/mod.rs +++ b/recipes/core/kernel/source/src/sync/mod.rs @@ -1,5 +1,6 @@ pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue}; +pub mod mcs; pub mod ordered; pub mod wait_condition; pub mod wait_queue; diff --git a/recipes/core/kernel/source/src/sync/ordered.rs b/recipes/core/kernel/source/src/sync/ordered.rs index 91d46158db..c6763cb663 100644 --- a/recipes/core/kernel/source/src/sync/ordered.rs +++ b/recipes/core/kernel/source/src/sync/ordered.rs @@ -52,7 +52,9 @@ //! *g1 = 12; //! ``` use alloc::sync::Arc; +use core::cell::UnsafeCell; use core::marker::PhantomData; +use core::ptr; use crate::percpu::PercpuBlock; @@ -732,3 +734,143 @@ impl Drop for ArcRwLockWriteGuard { /// This function can only be called if no lock is held by the calling thread/task #[inline] pub fn check_no_locks(_: LockToken<'_, L0>) {} + +// --------------------------------------------------------------------------- +// MCS-based fair mutex (McsMutex) +// --------------------------------------------------------------------------- + +/// A mutual exclusion lock using the MCS fair spinlock algorithm. +/// +/// Unlike `Mutex` which uses a simple spinlock (no fairness under +/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning: +/// +/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing. +/// - FIFO ordering prevents starvation. +/// - O(1) cache-line transfers on unlock. 
+/// +/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so +/// this type is suitable for scheduler-internal locks where the holder is always +/// the current CPU. +pub struct McsMutex { + raw: crate::sync::mcs::McsRawLock, + data: UnsafeCell, + _phantom: PhantomData, +} + +unsafe impl Sync for McsMutex {} +unsafe impl Send for McsMutex {} + +impl McsMutex { + pub const fn new(val: T) -> Self { + Self { + raw: crate::sync::mcs::McsRawLock::new(), + data: UnsafeCell::new(val), + _phantom: PhantomData, + } + } +} + +impl McsMutex { + pub fn lock<'a, LP: Lower + 'a>( + &'a self, + lock_token: LockToken<'a, LP>, + ) -> McsMutexGuard<'a, L, T> { + let percpu = PercpuBlock::current(); + let contended = self.raw.acquire(&percpu.mcs_sched_node); + if contended { + percpu + .mcs_contention_count + .set(percpu.mcs_contention_count.get() + 1); + } + McsMutexGuard { + lock: self, + lock_token: LockToken::downgraded(lock_token), + } + } + + pub fn try_lock<'a, LP: Lower + 'a>( + &'a self, + lock_token: LockToken<'a, LP>, + ) -> Option> { + let percpu = PercpuBlock::current(); + if self.raw.try_acquire(&percpu.mcs_sched_node) { + Some(McsMutexGuard { + lock: self, + lock_token: LockToken::downgraded(lock_token), + }) + } else { + None + } + } +} + +pub struct McsMutexGuard<'a, L: Level, T: 'a> { + lock: &'a McsMutex, + lock_token: LockToken<'a, L>, +} + +impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> { + pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) { + unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) } + } + + pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) { + let lock_ref = self.lock; + let token = unsafe { core::ptr::read(&self.lock_token) }; + core::mem::forget(self); + (McsRawGuard { lock: lock_ref }, token) + } + + pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self { + let lock_ref = raw.lock; + core::mem::forget(raw); + Self { + lock: lock_ref, + lock_token: token, 
+ } + } +} + +impl core::ops::Deref for McsMutexGuard<'_, L, T> { + type Target = T; + fn deref(&self) -> &Self::Target { + unsafe { &*self.lock.data.get() } + } +} + +impl core::ops::DerefMut for McsMutexGuard<'_, L, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { &mut *self.lock.data.get() } + } +} + +impl Drop for McsMutexGuard<'_, L, T> { + fn drop(&mut self) { + let percpu = PercpuBlock::current(); + self.lock.raw.release(&percpu.mcs_sched_node); + } +} + +pub struct McsRawGuard<'a, L: Level, T: 'a> { + lock: &'a McsMutex, +} + +impl core::ops::Deref for McsRawGuard<'_, L, T> { + type Target = T; + fn deref(&self) -> &Self::Target { + unsafe { &*self.lock.data.get() } + } +} + +impl core::ops::DerefMut for McsRawGuard<'_, L, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { &mut *self.lock.data.get() } + } +} + +impl Drop for McsRawGuard<'_, L, T> { + fn drop(&mut self) { + let percpu = PercpuBlock::current(); + self.lock.raw.release(&percpu.mcs_sched_node); + } +} diff --git a/recipes/core/kernel/source/src/syscall/fs.rs b/recipes/core/kernel/source/src/syscall/fs.rs index bf984641f4..acd3bc2212 100644 --- a/recipes/core/kernel/source/src/syscall/fs.rs +++ b/recipes/core/kernel/source/src/syscall/fs.rs @@ -2,7 +2,7 @@ use core::num::NonZeroUsize; -use alloc::{string::String, sync::Arc, vec::Vec}; +use alloc::{format, string::{String, ToString}, sync::Arc, vec::Vec}; use redox_path::RedoxPath; use crate::{ @@ -12,9 +12,9 @@ use crate::{ memory::{AddrSpace, GenericFlusher, Grant, PageSpan, TlbShootdownActions}, }, memory::{Page, VirtualAddress, PAGE_SIZE}, - scheme::{self, FileHandle, KernelScheme, OpenResult, StrOrBytes}, + scheme::{self, pipe, FileHandle, KernelScheme, OpenResult, SchemeExt, StrOrBytes}, sync::{CleanLockToken, RwLock}, - syscall::{data::Stat, error::*, flag::*}, + syscall::{data::{GlobalSchemes, Stat}, error::*, flag::*}, }; use super::usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo}; @@ -45,7 +45,7 
@@ pub fn file_op_generic_ext( (file, desc) }; - let scheme = scheme::get_scheme(token.token(), desc.scheme)?; + let scheme = desc.get_scheme(token)?; op(&*scheme, file.description, desc, token) } @@ -62,55 +62,32 @@ pub fn copy_path_to_buf(raw_path: UserSliceRo, max_len: usize) -> Result // TODO: Define elsewhere const PATH_MAX: usize = PAGE_SIZE; -pub fn openat( - fh: FileHandle, - raw_path: UserSliceRo, +fn fifo_path_key(scheme_id: scheme::SchemeId, number: usize, path: &str) -> String { + if path.starts_with('/') { + path.to_string() + } else { + format!("@fifo:{}:{}:{}", scheme_id.get(), number, path) + } +} + +fn install_open_result( + scheme_id: scheme::SchemeId, flags: usize, - fcntl_flags: u32, - euid: u32, - egid: u32, + open_result: OpenResult, token: &mut CleanLockToken, ) -> Result { - let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; - - let (scheme_id, number) = { - let current_lock = context::current(); - let mut current = current_lock.read(token.token()); - let (context, mut token) = current.token_split(); - let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?; - let desc = pipe.description.read(token.token()); - (desc.scheme, desc.number) - }; - - let caller_ctx = context::current() - .read(token.token()) - .caller_ctx() - .filter_uid_gid(euid, egid); - - let new_description = { - let scheme = scheme::get_scheme(token.token(), scheme_id)?; - - let res = scheme.kopenat( - number, - StrOrBytes::from_str(&path_buf), - flags, - fcntl_flags, - caller_ctx, - token, - ); - - match res? 
{ - OpenResult::SchemeLocal(number, internal_flags) => { - Arc::new(RwLock::new(FileDescription { - offset: 0, - internal_flags, - scheme: scheme_id, - number, - flags: (flags & !O_CLOEXEC) as u32, - })) - } - OpenResult::External(desc) => desc, - } + let new_description = match open_result { + OpenResult::SchemeLocal(number, internal_flags) => Arc::new(RwLock::new( + FileDescription::new( + scheme_id, + number, + 0, + (flags & !O_CLOEXEC) as u32, + internal_flags, + token, + ), + )), + OpenResult::External(desc) => desc, }; let current_lock = context::current(); @@ -126,6 +103,102 @@ pub fn openat( ) .ok_or(Error::new(EMFILE)) } + +fn path_exists_in_scheme( + scheme: &dyn KernelScheme, + number: usize, + path: &str, + caller_ctx: scheme::CallerCtx, + token: &mut CleanLockToken, +) -> Result { + match scheme.kopenat(number, StrOrBytes::from_str(path), O_STAT, 0, caller_ctx, token) { + Ok(OpenResult::SchemeLocal(number, _)) => { + let _ = scheme.close(number, token); + Ok(true) + } + Ok(OpenResult::External(_)) => Ok(true), + Err(err) if err.errno == ENOENT => Ok(false), + Err(err) => Err(err), + } +} + +pub fn openat( + fh: FileHandle, + raw_path: UserSliceRo, + flags: usize, + fcntl_flags: u32, + euid: u32, + egid: u32, + token: &mut CleanLockToken, +) -> Result { + let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; + + let desc = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut context_token) = current.token_split(); + let pipe = context + .get_file(fh, &mut context_token) + .ok_or(Error::new(EBADF))?; + *pipe.description.read(context_token.token()) + }; + let scheme = desc.get_scheme(token)?; + let number = desc.number; + let scheme_id = desc.scheme; + + let caller_ctx = context::current() + .read(token.token()) + .caller_ctx() + .filter_uid_gid(euid, egid); + + let fifo_mode_requested = flags & MODE_FIFO as usize == MODE_FIFO as usize; + let fifo_key = fifo_path_key(scheme_id, number, 
&path_buf); + + if pipe::named_pipe_exists(&fifo_key, token) { + if flags & O_EXCL == O_EXCL && flags & O_CREAT == O_CREAT { + return Err(Error::new(EEXIST)); + } + if fifo_mode_requested && flags & O_CREAT == O_CREAT { + return Err(Error::new(EEXIST)); + } + + let pipe_number = pipe::open_named_pipe(&fifo_key, flags, token)? + .ok_or(Error::new(ENOENT))?; + return install_open_result( + GlobalSchemes::Pipe.scheme_id(), + flags, + OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()), + token, + ); + } + + if fifo_mode_requested && flags & O_CREAT == O_CREAT { + if path_exists_in_scheme(&*scheme, number, &path_buf, caller_ctx, token)? { + return Err(Error::new(EEXIST)); + } + + let mode = u16::try_from(flags & 0o7777).map_err(|_| Error::new(EINVAL))?; + let pipe_number = pipe::create_named_pipe(&fifo_key, &path_buf, mode, flags, token)?; + + return install_open_result( + GlobalSchemes::Pipe.scheme_id(), + flags, + OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()), + token, + ); + } + + let open_result = scheme.kopenat( + number, + StrOrBytes::from_str(&path_buf), + flags, + fcntl_flags, + caller_ctx, + token, + )?; + + install_open_result(scheme_id, flags, open_result, token) +} /// Unlinkat syscall pub fn unlinkat( fh: FileHandle, @@ -137,22 +210,27 @@ pub fn unlinkat( ) -> Result<()> { let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; - let (number, scheme_id) = { + let desc = { let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut token) = current.token_split(); - let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?; - let desc = pipe.description.read(token.token()); - (desc.number, desc.scheme) + let (context, mut context_token) = current.token_split(); + let pipe = context + .get_file(fh, &mut context_token) + .ok_or(Error::new(EBADF))?; + *pipe.description.read(context_token.token()) }; - - let scheme = scheme::get_scheme(token.token(), scheme_id)?; + let number = 
desc.number; + let scheme = desc.get_scheme(token)?; let caller_ctx = context::current() .read(token.token()) .caller_ctx() .filter_uid_gid(euid, egid); + if pipe::unlink_named_pipe(&fifo_path_key(desc.scheme, number, &path_buf), token) { + return Ok(()); + } + /* let mut path_buf = BorrowedHtBuf::head()?; let path = path_buf.use_for_string(raw_path)?; @@ -199,17 +277,18 @@ fn duplicate_file( let description = { *file.description.read(token.token()) }; let new_description = { - let scheme = scheme::get_scheme(token.token(), description.scheme)?; + let scheme = description.get_scheme(token)?; match scheme.kdup(description.number, user_buf, caller_ctx, token)? { OpenResult::SchemeLocal(number, internal_flags) => { - Arc::new(RwLock::new(FileDescription { - offset: 0, - internal_flags, - scheme: description.scheme, + Arc::new(RwLock::new(FileDescription::new( + description.scheme, number, - flags: description.flags, - })) + 0, + description.flags, + internal_flags, + token, + ))) } OpenResult::External(desc) => desc, } @@ -296,11 +375,10 @@ fn call_normal( } .ok_or(Error::new(EBADF))?; - let (scheme_id, number) = { - let desc = file.description.read(token.token()); - (desc.scheme, desc.number) + let (scheme, number) = { + let desc = *file.description.read(token.token()); + (desc.get_scheme(token)?, desc.number) }; - let scheme = scheme::get_scheme(token.token(), scheme_id)?; if flags.contains(CallFlags::STD_FS) { scheme.translate_std_fs_call(number, file.description, payload, flags, metadata, token) @@ -341,28 +419,28 @@ fn fdwrite_inner( ) -> Result { // TODO: Ensure deadlocks can't happen let (scheme, number, descs_to_send) = { - let (scheme, number) = { + let desc = { let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut token) = current.token_split(); + let (context, mut context_token) = current.token_split(); let file_descriptor = context - .get_file(socket, &mut token) + .get_file(socket, &mut 
context_token) .ok_or(Error::new(EBADF))?; - let desc = &file_descriptor.description.read(token.token()); - (desc.scheme, desc.number) + *file_descriptor.description.read(context_token.token()) }; - let scheme = scheme::get_scheme(token.token(), scheme)?; + let scheme = desc.get_scheme(token)?; + let number = desc.number; let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut token) = current.token_split(); + let (context, mut context_token) = current.token_split(); ( scheme, number, if flags.contains(CallFlags::FD_CLONE) { - context.bulk_get_files(&target_fds, &mut token) + context.bulk_get_files(&target_fds, &mut context_token) } else { - context.bulk_remove_files(&target_fds, &mut token) + context.bulk_remove_files(&target_fds, &mut context_token) }? .into_iter() .map(|f| f.description) @@ -395,18 +473,22 @@ fn call_fdread( metadata: &[u64], token: &mut CleanLockToken, ) -> Result { + let desc = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut context_token) = current.token_split(); + let file_descriptor = context + .get_file(fd, &mut context_token) + .ok_or(Error::new(EBADF))?; + *file_descriptor.description.read(context_token.token()) + }; let (scheme, number) = { - let (scheme, number) = { - let current_lock = context::current(); - let mut current = current_lock.read(token.token()); - let (context, mut token) = current.token_split(); - let file_descriptor = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?; - let desc = file_descriptor.description.read(token.token()); - (desc.scheme, desc.number) - }; - let scheme = scheme::get_scheme(token.token(), scheme)?; - - (scheme, number) + let scheme = desc.get_scheme(token)?; + let number = desc.number; + ( + scheme, + number, + ) }; scheme.kfdread(number, payload, flags, metadata, token) @@ -440,9 +522,9 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut 
CleanLockToken) } .ok_or(Error::new(EBADF))?; - let (scheme_id, number, flags) = { - let desc = file.description.write(token.token()); - (desc.scheme, desc.number, desc.flags) + let (number, flags, desc) = { + let desc = *file.description.read(token.token()); + (desc.number, desc.flags, desc) }; if cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC { @@ -460,7 +542,7 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken) // Communicate fcntl with scheme if cmd != F_GETFD && cmd != F_SETFD { - let scheme = scheme::get_scheme(token.token(), scheme_id)?; + let scheme = desc.get_scheme(token)?; scheme.fcntl(number, cmd, arg, token)?; }; @@ -518,13 +600,11 @@ pub fn flink(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken) let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?; let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?; - let (number, scheme_id) = { - let desc = file.description.read(token.token()); - (desc.number, desc.scheme) + let (number, scheme) = { + let desc = *file.description.read(token.token()); + (desc.number, desc.get_scheme(token)?) }; - let scheme = scheme::get_scheme(token.token(), scheme_id)?; - // TODO: Check EXDEV. /* if scheme_id != description.scheme { @@ -554,13 +634,11 @@ pub fn frename(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?; let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?; - let (number, scheme_id) = { - let desc = file.description.read(token.token()); - (desc.number, desc.scheme) + let (number, scheme) = { + let desc = *file.description.read(token.token()); + (desc.number, desc.get_scheme(token)?) }; - let scheme = scheme::get_scheme(token.token(), scheme_id)?; - // TODO: Check EXDEV. 
/* if scheme_id != description.scheme { diff --git a/recipes/core/kernel/source/src/syscall/mod.rs b/recipes/core/kernel/source/src/syscall/mod.rs index 450a9d112f..c7d67727d8 100644 --- a/recipes/core/kernel/source/src/syscall/mod.rs +++ b/recipes/core/kernel/source/src/syscall/mod.rs @@ -28,6 +28,11 @@ use crate::{ sync::CleanLockToken, }; +/// Local syscall numbers not yet in the redox_syscall crate. +/// These are allocated from the 987+ range to avoid collisions with crate numbers. +pub const SYS_SCHED_SETAFFINITY: usize = 987; +pub const SYS_SCHED_GETAFFINITY: usize = 988; + /// Debug pub mod debug; @@ -220,6 +225,10 @@ pub fn syscall( unlinkat(fd, UserSlice::ro(c, d)?, e, f as _, g as _, token).map(|()| 0) } SYS_YIELD => sched_yield(token).map(|()| 0), + + // P17-3: CPU affinity syscalls. Numbers allocated locally (not yet in redox_syscall crate). + SYS_SCHED_SETAFFINITY => sched_setaffinity(b, UserSlice::ro(c, d)?, token), + SYS_SCHED_GETAFFINITY => sched_getaffinity(b, UserSlice::wo(c, d)?, token), SYS_NANOSLEEP => nanosleep( UserSlice::ro(b, size_of::())?, UserSlice::wo(c, size_of::())?.none_if_null(), diff --git a/recipes/core/kernel/source/src/syscall/process.rs b/recipes/core/kernel/source/src/syscall/process.rs index e83da427b4..3edf23aa88 100644 --- a/recipes/core/kernel/source/src/syscall/process.rs +++ b/recipes/core/kernel/source/src/syscall/process.rs @@ -11,6 +11,7 @@ use crate::{ memory::{AddrSpace, Grant, PageSpan}, ContextRef, }, + cpu_set::RawMask, event, sync::{CleanLockToken, RwLock}, syscall::flag::{EventFlags, O_CREAT, O_RDWR}, @@ -271,24 +272,95 @@ unsafe fn bootstrap_mem(bootstrap: &crate::startup::Bootstrap) -> &'static [u8] } fn insert_fd(scheme: SchemeId, number: usize, cloexec: bool, token: &mut CleanLockToken) -> usize { + let description = Arc::new(RwLock::new(FileDescription::new( + scheme, + number, + 0, + (O_CREAT | O_RDWR) as u32, + InternalFlags::empty(), + token, + ))); + let current_lock = context::current(); let mut 
current = current_lock.read(token.token()); - let (context, mut token) = current.token_split(); + let (context, mut context_token) = current.token_split(); context .add_file_min( FileDescriptor { - description: Arc::new(RwLock::new(FileDescription { - scheme, - number, - offset: 0, - flags: (O_CREAT | O_RDWR) as u32, - internal_flags: InternalFlags::empty(), - })), + description, cloexec, }, syscall::flag::UPPER_FDTBL_TAG + scheme.get(), - &mut token, + &mut context_token, ) .expect("failed to insert fd to current context") .get() } + +/// Set CPU affinity mask for a process. +/// +/// # Arguments (syscall ABI) +/// - `pid`: Process ID (0 = current process; other PIDs not yet supported) +/// - `mask_ptr`: Pointer to a `RawMask` (32 bytes on 64-bit, 256-bit bitmap) +/// - `mask_len`: Length of mask in bytes (must equal `size_of::<RawMask>()`) +pub fn sched_setaffinity( + pid: usize, + mask_ptr: super::usercopy::UserSliceRo, + token: &mut CleanLockToken, +) -> Result<usize> { + // Validate mask size + if mask_ptr.len() != core::mem::size_of::<RawMask>() { + return Err(Error::new(super::error::EINVAL)); + } + + // pid == 0 means current process + let target = if pid == 0 { + context::current() + } else { + // TODO: Support PID-based lookup (requires context list iteration + // with lock token downgrades). For now, only pid=0 is supported. + return Err(Error::new(super::error::ESRCH)); + }; + + // Read mask from userspace + let raw_mask: RawMask = unsafe { mask_ptr.read_exact() }?; + + // Apply to context's affinity mask + let mut ctx = target.write(token.token()); + ctx.sched_affinity.override_from(&raw_mask); + + Ok(0) +} + +/// Get CPU affinity mask for a process. +/// +/// # Arguments (syscall ABI) +/// - `pid`: Process ID (0 = current process; other PIDs not yet supported) +/// - `mask_ptr`: Pointer to a `RawMask` buffer (32 bytes on 64-bit) +/// - `mask_len`: Length of buffer in bytes (must equal `size_of::<RawMask>()`) +/// +/// # Returns +/// Number of bytes written to mask_ptr on success.
+pub fn sched_getaffinity( + pid: usize, + mask_ptr: super::usercopy::UserSliceWo, + token: &mut CleanLockToken, +) -> Result<usize> { + // Validate mask size + if mask_ptr.len() != core::mem::size_of::<RawMask>() { + return Err(Error::new(super::error::EINVAL)); + } + + // pid == 0 means current process + let target = if pid == 0 { + context::current() + } else { + return Err(Error::new(super::error::ESRCH)); + }; + + let ctx = target.read(token.token()); + let raw_mask = ctx.sched_affinity.to_raw(); + mask_ptr.copy_common_bytes_from_slice(crate::cpu_set::mask_as_bytes(&raw_mask))?; + + Ok(core::mem::size_of::<RawMask>()) +} diff --git a/recipes/wip/x11/libxcvt/source/.gitlab-ci.yml b/recipes/wip/x11/libxcvt/source/.gitlab-ci.yml new file mode 100644 index 0000000000..f165f91394 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/.gitlab-ci.yml @@ -0,0 +1,112 @@ + +##################################################### +# # +# THIS FILE IS GENERATED, DO NOT EDIT! # +# # +# Generated with "ci-fairy generate-template", edit # +# .gitlab-ci/ci.template and .gitlab-ci/config.yml # +# and rerun "ci-fairy generate-template" to change # +# this file.
# +# # +##################################################### + +.templates_sha: &template_sha 3d03cccd770c04e63b40325b42223495274d6a1d + +include: + - project: 'freedesktop/ci-templates' + ref: *template_sha + file: + - '/templates/ci-fairy.yml' + - '/templates/fedora.yml' + - template: Security/SAST.gitlab-ci.yml + +stages: + - sanity check + - prep + - build + - test + +variables: + FDO_UPSTREAM_REPO: xorg/lib/libxcvt + MESON_BUILDDIR: "builddir" + NINJA_ARGS: '' + MESON_ARGS: '' + MESON_TEST_ARGS: '' + GIT_DEPTH: 1 + +.policy: + retry: + max: 2 + when: + - runner_system_failure + - stuck_or_timeout_failure + # cancel run when a newer version is pushed to the branch + interruptible: true + + +# Re-generate the CI script and make sure it's the one currently checked in +# If this job fails, re-generate the gitlab-ci.yml script, see +# $SRCDIR/.gitlab-ci/generate-gitlab-ci.py +# +check-ci-script: + extends: + - .fdo.ci-fairy + stage: sanity check + script: + - ci-fairy generate-template --verify && exit 0 || true + - echo "Committed gitlab-ci.yml differs from generated gitlab-ci.yml. Please verify" + - exit 1 + +# +# Verify that commit messages are as expected, signed-off, etc. 
+# +check-commit: + extends: + - .fdo.ci-fairy + stage: sanity check + script: + - ci-fairy check-commits --signed-off-by --junit-xml=results.xml + except: + - master@xorg/lib/libxcvt + variables: + GIT_DEPTH: 100 + artifacts: + reports: + junit: results.xml + +# +# Verify that merge request has the "allow collaboration" checkbox ticked +# +check-merge-request: + extends: + - .fdo.ci-fairy + stage: sanity check + script: + - ci-fairy check-merge-request --require-allow-collaboration --junit-xml=results.xml + artifacts: + when: on_failure + reports: + junit: results.xml + allow_failure: true + + +.fedora.34: + variables: + FDO_DISTRIBUTION_VERSION: '34' + FDO_DISTRIBUTION_TAG: '2022-08-03.0' + +prep-fedora-34: + extends: + - .fdo.container-build@fedora + - .fedora.34 + stage: prep + variables: + FDO_DISTRIBUTION_PACKAGES: "meson gcc" + +build-fedora-34: + extends: + - .fdo.distribution-image@fedora + - .fedora.34 + stage: build + script: + - .gitlab-ci/meson-build.sh diff --git a/recipes/wip/x11/libxcvt/source/.gitlab-ci/ci.template b/recipes/wip/x11/libxcvt/source/.gitlab-ci/ci.template new file mode 100644 index 0000000000..3728fcc786 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/.gitlab-ci/ci.template @@ -0,0 +1,118 @@ +{# You're looking at the template here, so you can ignore the below + warning. This is the right file to edit #} + +##################################################### +# # +# THIS FILE IS GENERATED, DO NOT EDIT! # +# # +# Generated with "ci-fairy generate-template", edit # +# .gitlab-ci/ci.template and .gitlab-ci/config.yml # +# and rerun "ci-fairy generate-template" to change # +# this file. 
# +# # +##################################################### + +.templates_sha: &template_sha 3d03cccd770c04e63b40325b42223495274d6a1d + +include: + - project: 'freedesktop/ci-templates' + ref: *template_sha + file: + - '/templates/ci-fairy.yml' + {% for d in distributions %} + - '/templates/{{d.name}}.yml' + {% endfor %} + - template: Security/SAST.gitlab-ci.yml + +stages: + - sanity check + - prep + - build + - test + +variables: + FDO_UPSTREAM_REPO: xorg/lib/libxcvt + MESON_BUILDDIR: "builddir" + NINJA_ARGS: '' + MESON_ARGS: '' + MESON_TEST_ARGS: '' + GIT_DEPTH: 1 + +.policy: + retry: + max: 2 + when: + - runner_system_failure + - stuck_or_timeout_failure + # cancel run when a newer version is pushed to the branch + interruptible: true + + +# Re-generate the CI script and make sure it's the one currently checked in +# If this job fails, re-generate the gitlab-ci.yml script, see +# $SRCDIR/.gitlab-ci/generate-gitlab-ci.py +# +check-ci-script: + extends: + - .fdo.ci-fairy + stage: sanity check + script: + - ci-fairy generate-template --verify && exit 0 || true + - echo "Committed gitlab-ci.yml differs from generated gitlab-ci.yml. Please verify" + - exit 1 + +# +# Verify that commit messages are as expected, signed-off, etc. 
+# +check-commit: + extends: + - .fdo.ci-fairy + stage: sanity check + script: + - ci-fairy check-commits --signed-off-by --junit-xml=results.xml + except: + - master@xorg/lib/libxcvt + variables: + GIT_DEPTH: 100 + artifacts: + reports: + junit: results.xml + +# +# Verify that merge request has the "allow collaboration" checkbox ticked +# +check-merge-request: + extends: + - .fdo.ci-fairy + stage: sanity check + script: + - ci-fairy check-merge-request --require-allow-collaboration --junit-xml=results.xml + artifacts: + when: on_failure + reports: + junit: results.xml + allow_failure: true + +{% for d in distributions %} + +.{{d.name}}.{{d.version}}: + variables: + FDO_DISTRIBUTION_VERSION: '{{d.version}}' + FDO_DISTRIBUTION_TAG: '{{d.tag}}' + +prep-{{d.name}}-{{d.version}}: + extends: + - .fdo.container-build@{{d.name}} + - .{{d.name}}.{{d.version}} + stage: prep + variables: + FDO_DISTRIBUTION_PACKAGES: "{{' '.join(d.packages)}}" + +build-{{d.name}}-{{d.version}}: + extends: + - .fdo.distribution-image@{{d.name}} + - .{{d.name}}.{{d.version}} + stage: build + script: + - .gitlab-ci/meson-build.sh +{% endfor %} diff --git a/recipes/wip/x11/libxcvt/source/.gitlab-ci/config.yml b/recipes/wip/x11/libxcvt/source/.gitlab-ci/config.yml new file mode 100644 index 0000000000..a17c3e7918 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/.gitlab-ci/config.yml @@ -0,0 +1,9 @@ +.default_tag: &default_tag '2022-08-03.0' + +distributions: + - name: fedora + tag: *default_tag + version: 34 + packages: + - meson + - gcc diff --git a/recipes/wip/x11/libxcvt/source/.gitlab-ci/meson-build.sh b/recipes/wip/x11/libxcvt/source/.gitlab-ci/meson-build.sh new file mode 100755 index 0000000000..50b84b0182 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/.gitlab-ci/meson-build.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +if [[ -f .meson_environment ]]; then + . .meson_environment +fi + +if [[ -z "$MESON_BUILDDIR" ]]; then + echo "\$MESON_BUILDDIR undefined." 
+ exit 1 +fi + +# emulate a few gitlab variables to make it easier to +# run and debug locally. +if [[ -z "$CI_JOB_ID" ]] || [[ -z "$CI_JOB_NAME" ]] || [[ -z "$CI_PROJECT_NAME" ]]; then + echo "Missing \$CI_JOB_ID or \$CI_JOB_NAME". + CI_PROJECT_NAME=$(basename "$PWD") + CI_JOB_ID=$(date +%s) + CI_JOB_NAME="${CI_PROJECT_NAME}-job-local" + echo "Simulating gitlab environment: " + echo " CI_JOB_ID=$CI_JOB_ID" + echo " CI_JOB_NAME=$CI_JOB_NAME" + echo " CI_PROJECT_NAME=$CI_PROJECT_NAME" +fi + + +echo "*************************************************" +echo "builddir: $MESON_BUILDDIR" +echo "meson args: $MESON_ARGS" +echo "ninja args: $NINJA_ARGS" +echo "meson test args: $MESON_TEST_ARGS" +echo "*************************************************" + +set -e + +rm -rf "$MESON_BUILDDIR" +meson "$MESON_BUILDDIR" $MESON_ARGS +meson configure "$MESON_BUILDDIR" +ninja -C "$MESON_BUILDDIR" $NINJA_ARGS + +if [[ -z "$MESON_TEST_ARGS" ]]; then + exit 0 +fi + +# we still want to generate the reports, even if meson test fails +meson test -C "$MESON_BUILDDIR" $MESON_TEST_ARGS --print-errorlogs +exit_code=$? + +exit $exit_code diff --git a/recipes/wip/x11/libxcvt/source/COPYING b/recipes/wip/x11/libxcvt/source/COPYING new file mode 100644 index 0000000000..274db76b26 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/COPYING @@ -0,0 +1,67 @@ +Copyright 2005-2006 Luc Verhaegen. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + + + +Copyright 2005-2006 Luc Verhaegen. +Copyright © 2021 Red Hat, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + + + +Copyright © 2000 Compaq Computer Corporation +Copyright © 2002 Hewlett Packard Company +Copyright © 2006 Intel Corporation +Copyright © 2008, 2021 Red Hat, Inc. 
+ +Permission to use, copy, modify, distribute, and sell this software and its +documentation for any purpose is hereby granted without fee, provided that +the above copyright notice appear in all copies and that both that copyright +notice and this permission notice appear in supporting documentation, and +that the name of the copyright holders not be used in advertising or +publicity pertaining to distribution of the software without specific, +written prior permission. The copyright holders make no representations +about the suitability of this software for any purpose. It is provided "as +is" without express or implied warranty. + +THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, +INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO +EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR +CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +OF THIS SOFTWARE. diff --git a/recipes/wip/x11/libxcvt/source/README.md b/recipes/wip/x11/libxcvt/source/README.md new file mode 100644 index 0000000000..99571bc63c --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/README.md @@ -0,0 +1,36 @@ +libxcvt +======= + +`libxcvt` is a library providing a standalone version of the X server +implementation of the VESA CVT standard timing modelines generator. + +`libxcvt` also provides a standalone version of the command line tool +`cvt` copied from the Xorg implementation and is meant to be a direct +replacement to the version provided by the `Xorg` server. 
+ +An example output is: + +``` +$ cvt --verbose 1920 1200 75 +# 1920x1200 74.93 Hz (CVT 2.30MA) hsync: 94.04 kHz; pclk: 245.25 MHz +Modeline "1920x1200_75.00" 245.25 1920 2064 2264 2608 1200 1203 1209 1255 -hsync +vsync +``` + +Building +======== + +`libxcvt` is built using [Meson](https://mesonbuild.com/) + + $ git clone https://gitlab.freedesktop.org/xorg/lib/libxcvt.git + $ cd libxcvt + $ meson build/ --prefix=... + $ ninja -C build/ install + $ cd .. + +Credit +====== + +The code base of `libxcvt` is identical to `xf86CVTMode()` therefore +all credits for `libxcvt` go to the author (Luc Verhaegen) and +contributors of `xf86CVTMode()` and the `cvt` utility as found in the +[xserver](https://gitlab.freedesktop.org/xorg/xserver/) repository. diff --git a/recipes/wip/x11/libxcvt/source/cvt/cvt.c b/recipes/wip/x11/libxcvt/source/cvt/cvt.c new file mode 100644 index 0000000000..90c0c8d045 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/cvt/cvt.c @@ -0,0 +1,257 @@ +/* + * Copyright 2005-2006 Luc Verhaegen. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/* Standalone VESA CVT standard timing modelines generator. */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> + +#include <libxcvt/libxcvt.h> + +static bool +cvt_is_standard(int hdisplay, int vdisplay, float vrefresh, bool reduced, bool verbose) +{ + bool is_cvt = true; + + if ((!(vdisplay % 3) && ((vdisplay * 4 / 3) == hdisplay)) || + (!(vdisplay % 9) && ((vdisplay * 16 / 9) == hdisplay)) || + (!(vdisplay % 10) && ((vdisplay * 16 / 10) == hdisplay)) || + (!(vdisplay % 4) && ((vdisplay * 5 / 4) == hdisplay)) || + (!(vdisplay % 9) && ((vdisplay * 15 / 9) == hdisplay))); + else { + if (verbose) + fprintf(stderr, "Warning: Aspect Ratio is not CVT standard.\n"); + is_cvt = false; + } + + if ((vrefresh != 50.0) && (vrefresh != 60.0) && + (vrefresh != 75.0) && (vrefresh != 85.0)) { + if (verbose) + fprintf(stderr, "Warning: Refresh Rate %.2f is not CVT standard " + "(50, 60, 75 or 85Hz).\n", vrefresh); + is_cvt = false; + } + + return is_cvt; +} +/* + * I'm not documenting --interlaced for obvious reasons, even though I did + * implement it. I also can't deny having looked at gtf here.
+ */ +static void +print_usage(char *Name) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "usage: %s [-v|--verbose] [-r|--reduced] X Y [refresh]\n", + Name); + fprintf(stderr, "\n"); + fprintf(stderr, " -v|--verbose : Warn about CVT standard adherence.\n"); + fprintf(stderr, " -r|--reduced : Create a mode with reduced blanking " + "(default: normal blanking).\n"); + fprintf(stderr, " X : Desired horizontal resolution " + "(multiple of 8, required).\n"); + fprintf(stderr, + " Y : Desired vertical resolution (required).\n"); + fprintf(stderr, + " refresh : Desired refresh rate (default: 60.0Hz).\n"); + fprintf(stderr, "\n"); + + fprintf(stderr, "Calculates VESA CVT (Coordinated Video Timing) modelines" + " for use with X.\n"); +} + +/* + * + */ +static void +print_comment(struct libxcvt_mode_info *mode_info, bool is_cvt, bool reduced) +{ + printf("# %dx%d %.2f Hz ", mode_info->hdisplay, mode_info->vdisplay, mode_info->vrefresh); + + if (is_cvt) { + printf("(CVT %.2fM", + ((float) mode_info->hdisplay * mode_info->vdisplay) / 1000000.0); + + if (!(mode_info->vdisplay % 3) && + ((mode_info->vdisplay * 4 / 3) == mode_info->hdisplay)) + printf("3"); + else if (!(mode_info->vdisplay % 9) && + ((mode_info->vdisplay * 16 / 9) == mode_info->hdisplay)) + printf("9"); + else if (!(mode_info->vdisplay % 10) && + ((mode_info->vdisplay * 16 / 10) == mode_info->hdisplay)) + printf("A"); + else if (!(mode_info->vdisplay % 4) && + ((mode_info->vdisplay * 5 / 4) == mode_info->hdisplay)) + printf("4"); + else if (!(mode_info->vdisplay % 9) && + ((mode_info->vdisplay * 15 / 9) == mode_info->hdisplay)) + printf("9"); + + if (reduced) + printf("-R"); + + printf(") "); + } + else + printf("(CVT) "); + + printf("hsync: %.2f kHz; ", mode_info->hsync); + printf("pclk: %.2f MHz", ((float) mode_info->dot_clock) / 1000.0); + + printf("\n"); +} + +/* + * Originally grabbed from xf86Mode.c. + * + * Ignoring the actual mode_info->name, as the user will want something solid + * to grab hold of. 
+ */ +static void +print_mode_line(struct libxcvt_mode_info *mode_info, int hdisplay, int vdisplay, float vrefresh, + bool reduced) +{ + if (reduced) + printf("Modeline \"%dx%dR\" ", hdisplay, vdisplay); + else + printf("Modeline \"%dx%d_%.2f\" ", hdisplay, vdisplay, vrefresh); + + printf("%6.2f %i %i %i %i %i %i %i %i", mode_info->dot_clock / 1000., + mode_info->hdisplay, mode_info->hsync_start, mode_info->hsync_end, mode_info->htotal, + mode_info->vdisplay, mode_info->vsync_start, mode_info->vsync_end, mode_info->vtotal); + + if (mode_info->mode_flags & LIBXCVT_MODE_FLAG_INTERLACE) + printf(" interlace"); + if (mode_info->mode_flags & LIBXCVT_MODE_FLAG_HSYNC_POSITIVE) + printf(" +hsync"); + if (mode_info->mode_flags & LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE) + printf(" -hsync"); + if (mode_info->mode_flags & LIBXCVT_MODE_FLAG_VSYNC_POSITIVE) + printf(" +vsync"); + if (mode_info->mode_flags & LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE) + printf(" -vsync"); + + printf("\n"); +} + +/* + * + */ +int +main(int argc, char *argv[]) +{ + struct libxcvt_mode_info *mode_info; + int hdisplay = 0, vdisplay = 0; + float vrefresh = 0.0; + bool reduced = false, verbose = false, is_cvt; + bool interlaced = false; + int n; + + if ((argc < 3) || (argc > 7)) { + print_usage(argv[0]); + return 1; + } + + /* This doesn't filter out bad flags properly. Bad flags get passed down + * to atoi/atof, which then return 0, so that these variables can get + * filled next time round. So this is just a cosmetic problem. 
+ */ + for (n = 1; n < argc; n++) { + if (!strcmp(argv[n], "-r") || !strcmp(argv[n], "--reduced")) + reduced = true; + else if (!strcmp(argv[n], "-i") || !strcmp(argv[n], "--interlaced")) + interlaced = true; + else if (!strcmp(argv[n], "-v") || !strcmp(argv[n], "--verbose")) + verbose = true; + else if (!strcmp(argv[n], "-h") || !strcmp(argv[n], "--help")) { + print_usage(argv[0]); + return 0; + } + else if (!hdisplay) { + hdisplay = atoi(argv[n]); + if (!hdisplay) { + print_usage(argv[0]); + return 1; + } + } + else if (!vdisplay) { + vdisplay = atoi(argv[n]); + if (!vdisplay) { + print_usage(argv[0]); + return 1; + } + } + else if (!vrefresh) { + vrefresh = atof(argv[n]); + if (!vrefresh) { + print_usage(argv[0]); + return 1; + } + } + else { + print_usage(argv[0]); + return 1; + } + } + + if (!hdisplay || !vdisplay) { + print_usage(argv[0]); + return 0; + } + + /* Default to 60.0Hz */ + if (!vrefresh) + vrefresh = 60.0; + + /* Horizontal timing is always a multiple of 8: round up. */ + if (hdisplay & 0x07) { + hdisplay &= ~0x07; + hdisplay += 8; + } + + if (reduced) { + if ((vrefresh / 60.0) != floor(vrefresh / 60.0)) { + fprintf(stderr, + "\nERROR: Multiple of 60Hz refresh rate required for " + " reduced blanking.\n"); + print_usage(argv[0]); + return 0; + } + } + + mode_info = libxcvt_gen_mode_info(hdisplay, vdisplay, vrefresh, reduced, interlaced); + if (!mode_info) { + fprintf(stderr, "Out of memory!\n"); + return 0; + } + + is_cvt = cvt_is_standard(hdisplay, vdisplay, vrefresh, reduced, verbose); + print_comment(mode_info, is_cvt, reduced); + print_mode_line(mode_info, hdisplay, vdisplay, vrefresh, reduced); + free(mode_info); + + return 0; +} diff --git a/recipes/wip/x11/libxcvt/source/cvt/meson.build b/recipes/wip/x11/libxcvt/source/cvt/meson.build new file mode 100644 index 0000000000..5262471960 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/cvt/meson.build @@ -0,0 +1,10 @@ +cvt_src = [ + 'cvt.c', +] + +executable('cvt', + cvt_src, + 
include_directories : inc, + link_with : libxcvt, + dependencies: mdep, + install : true) diff --git a/recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt.h b/recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt.h new file mode 100644 index 0000000000..051a871288 --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt.h @@ -0,0 +1,46 @@ +/* + * Copyright 2005-2006 Luc Verhaegen. + * Copyright © 2021 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#ifndef _LIBCVT_H_ +#define _LIBCVT_H_ + +#include <stdbool.h> +#include <libxcvt/libxcvt_mode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct libxcvt_mode_info * +libxcvt_gen_mode_info(int hdisplay, + int vdisplay, + float vrefresh, + bool reduced, + bool interlaced); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBCVT_H_ */ diff --git a/recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt_mode.h b/recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt_mode.h new file mode 100644 index 0000000000..f29739e73c --- /dev/null +++ b/recipes/wip/x11/libxcvt/source/include/libxcvt/libxcvt_mode.h @@ -0,0 +1,56 @@ +/* + * Copyright © 2000 Compaq Computer Corporation + * Copyright © 2002 Hewlett Packard Company + * Copyright © 2006 Intel Corporation + * Copyright © 2008, 2021 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef _LIBXCVT_MODE_H_
+#define _LIBXCVT_MODE_H_
+
+#include <stdint.h>
+
+/* Conveniently chosen to match the RandR definitions */
+enum libxcvt_mode_flags {
+    LIBXCVT_MODE_FLAG_HSYNC_POSITIVE = (1 << 0),
+    LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE = (1 << 1),
+    LIBXCVT_MODE_FLAG_VSYNC_POSITIVE = (1 << 2),
+    LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE = (1 << 3),
+    LIBXCVT_MODE_FLAG_INTERLACE = (1 << 4),
+};
+
+struct libxcvt_mode_info {
+    uint32_t hdisplay;      /* active pixels per line */
+    uint32_t vdisplay;      /* active lines */
+    float vrefresh;         /* field rate, Hz */
+    float hsync;            /* horizontal frequency, kHz */
+    uint64_t dot_clock;     /* pixel clock, kHz */
+    uint16_t hsync_start;   /* horizontal sync start, sync end and line total, in pixels */
+    uint16_t hsync_end;
+    uint16_t htotal;
+    uint16_t vsync_start;   /* vertical sync start, sync end and total, in lines */
+    uint16_t vsync_end;
+    uint16_t vtotal;
+    enum libxcvt_mode_flags mode_flags; /* OR-combination of enum libxcvt_mode_flags bits */
+};
+
+#endif /* _LIBXCVT_MODE_H_ */
diff --git a/recipes/wip/x11/libxcvt/source/include/libxcvt/meson.build b/recipes/wip/x11/libxcvt/source/include/libxcvt/meson.build
new file mode 100644
index 0000000000..fde2cbe05c
--- /dev/null
+++ b/recipes/wip/x11/libxcvt/source/include/libxcvt/meson.build
@@ -0,0 +1 @@
+install_headers('libxcvt.h','libxcvt_mode.h', subdir: 'libxcvt')
diff --git a/recipes/wip/x11/libxcvt/source/include/meson.build b/recipes/wip/x11/libxcvt/source/include/meson.build
new file mode 100644
index 0000000000..4fbe7bddb6
--- /dev/null
+++ b/recipes/wip/x11/libxcvt/source/include/meson.build
@@ -0,0 +1 @@
+subdir('libxcvt')
diff --git a/recipes/wip/x11/libxcvt/source/lib/libxcvt.c b/recipes/wip/x11/libxcvt/source/lib/libxcvt.c
new file mode 100644
index 0000000000..003c8221d1
--- /dev/null
+++ b/recipes/wip/x11/libxcvt/source/lib/libxcvt.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2005-2006 Luc Verhaegen.
+ * Copyright © 2021 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/* Standalone VESA CVT standard timing modelines generator. */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#include <libxcvt/libxcvt.h>
+
+/*
+ * Generate a CVT standard mode from hdisplay, vdisplay and vrefresh.
+ *
+ * These calculations are stolen from the CVT calculation spreadsheet written
+ * by Graham Loveridge. He seems to be claiming no copyright and there seems to
+ * be no license attached to this. He apparently just wants to see his name
+ * mentioned.
+ *
+ * This file can be found at http://www.vesa.org/Public/CVT/CVTd6r1.xls
+ *
+ * Comments and structure correspond to the comments and structure of the xls.
+ * This should ease importing of future changes to the standard (not very
+ * likely though).
+ *
+ * About margins; i'm sure that they are to be the bit between HDisplay and
+ * HBlankStart, HBlankEnd and HTotal, VDisplay and VBlankStart, VBlankEnd and
+ * VTotal, where the overscan colour is shown. FB seems to call _all_ blanking
+ * outside sync "margin" for some reason. Since we prefer seeing proper
+ * blanking instead of the overscan colour, and since the Crtc* values will
+ * probably get altered after us, we will disable margins altogether. With
+ * these calculations, Margins will plainly expand H/VDisplay, and we don't
+ * want that. -- libv
+ * Returns a calloc()ed mode, or NULL if allocation fails; vrefresh == 0 selects 60 Hz.
+ */
+struct libxcvt_mode_info *
+libxcvt_gen_mode_info(int hdisplay, int vdisplay, float vrefresh, bool reduced, bool interlaced)
+{
+    bool margins = false;       /* always off, see "About margins" above */
+    float vfield_rate, hperiod; /* field rate; estimated line period in the unit of CVT_MIN_VSYNC_BP (µs) */
+    int hdisplay_rnd, hmargin;
+    int vdisplay_rnd, vmargin, vsync;
+    float interlace;            /* 0.5 when interlaced, else 0.0 (upstream note: "Please rename this") */
+    struct libxcvt_mode_info *mode_info;
+
+    mode_info = calloc(1, sizeof *mode_info);   /* zero-filled, so mode_flags starts at 0 */
+    if (!mode_info)
+        return NULL;            /* allocation failed */
+
+    mode_info->hdisplay = hdisplay;
+    mode_info->vdisplay = vdisplay;
+    mode_info->vrefresh = vrefresh;
+
+    /* 1) top/bottom margin size (% of height) - default: 1.8 */
+#define CVT_MARGIN_PERCENTAGE 1.8
+
+    /* 2) character cell horizontal granularity (pixels) - default 8 */
+#define CVT_H_GRANULARITY 8
+
+    /* 4) Minimum vertical front porch (lines) - default 3 */
+#define CVT_MIN_V_PORCH_RND 3
+
+    /* 4) Minimum number of vertical back porch lines - default 6 */
+#define CVT_MIN_V_BPORCH 6
+
+    /* Pixel Clock step (kHz) */
+#define CVT_CLOCK_STEP 250
+
+    /* CVT default is 60.0Hz */
+    if (!mode_info->vrefresh)
+        mode_info->vrefresh = 60.0;
+
+    /* 1. Required field rate */
+    if (interlaced)
+        vfield_rate = mode_info->vrefresh * 2;
+    else
+        vfield_rate = mode_info->vrefresh;
+
+    /* 2. Horizontal pixels */
+    hdisplay_rnd = mode_info->hdisplay - (mode_info->hdisplay % CVT_H_GRANULARITY); /* round down to a multiple of CVT_H_GRANULARITY */
+
+    /* 3. Determine left and right borders */
+    if (margins) {
+        /* right margin is actually exactly the same as left */
+        hmargin = (((float) hdisplay_rnd) * CVT_MARGIN_PERCENTAGE / 100.0);
+        hmargin -= hmargin % CVT_H_GRANULARITY;
+    }
+    else {
+        hmargin = 0;
+    }
+
+    /* 4. Find total active pixels */
+    mode_info->hdisplay = hdisplay_rnd + 2 * hmargin;
+
+    /* 5. Find number of lines per field */
+    if (interlaced)
+        vdisplay_rnd = mode_info->vdisplay / 2;
+    else
+        vdisplay_rnd = mode_info->vdisplay;
+
+    /* 6. Find top and bottom margins */
+    /* nope. */
+    if (margins)
+        /* top and bottom margins are equal again. */
+        vmargin = (((float) vdisplay_rnd) * CVT_MARGIN_PERCENTAGE / 100.0);
+    else
+        vmargin = 0;
+
+    mode_info->vdisplay = mode_info->vdisplay + 2 * vmargin;
+
+    /* 7. interlace */
+    if (interlaced)
+        interlace = 0.5;
+    else
+        interlace = 0.0;
+
+    /* Determine vsync Width from aspect ratio */
+    if (!(mode_info->vdisplay % 3) && ((mode_info->vdisplay * 4 / 3) == mode_info->hdisplay))
+        vsync = 4;
+    else if (!(mode_info->vdisplay % 9) && ((mode_info->vdisplay * 16 / 9) == mode_info->hdisplay))
+        vsync = 5;
+    else if (!(mode_info->vdisplay % 10) && ((mode_info->vdisplay * 16 / 10) == mode_info->hdisplay))
+        vsync = 6;
+    else if (!(mode_info->vdisplay % 4) && ((mode_info->vdisplay * 5 / 4) == mode_info->hdisplay))
+        vsync = 7;
+    else if (!(mode_info->vdisplay % 9) && ((mode_info->vdisplay * 15 / 9) == mode_info->hdisplay))
+        vsync = 7;
+    else                        /* Custom */
+        vsync = 10;
+
+    if (!reduced) {             /* simplified GTF calculation */
+
+        /* 4) Minimum time of vertical sync + back porch interval (µs)
+         * default 550.0 */
+#define CVT_MIN_VSYNC_BP 550.0
+
+        /* 3) Nominal HSync width (% of line period) - default 8 */
+#define CVT_HSYNC_PERCENTAGE 8
+
+        float hblank_percentage;
+        int vsync_and_back_porch, vback_porch;
+        int hblank, hsync_w;
+
+        /* 8. Estimated Horizontal period */
+        hperiod = ((float) (1000000.0 / vfield_rate - CVT_MIN_VSYNC_BP)) /
+            (vdisplay_rnd + 2 * vmargin + CVT_MIN_V_PORCH_RND + interlace);
+
+        /* 9. Find number of lines in sync + backporch */
+        if (((int) (CVT_MIN_VSYNC_BP / hperiod) + 1) <
+            (vsync + CVT_MIN_V_BPORCH))
+            vsync_and_back_porch = vsync + CVT_MIN_V_BPORCH;
+        else
+            vsync_and_back_porch = (int) (CVT_MIN_VSYNC_BP / hperiod) + 1;
+
+        /* 10. Find number of lines in back porch */
+        vback_porch = vsync_and_back_porch - vsync;
+        (void) vback_porch;     /* the value is not used below; the cast marks it as deliberately unused */
+
+        /* 11. Find total number of lines in vertical field */
+        mode_info->vtotal =
+            vdisplay_rnd + 2 * vmargin + vsync_and_back_porch + interlace +
+            CVT_MIN_V_PORCH_RND;    /* NOTE(review): vtotal is uint16_t, so the 0.5 interlace term is truncated here */
+
+        /* 5) Definition of Horizontal blanking time limitation */
+        /* Gradient (%/kHz) - default 600 */
+#define CVT_M_FACTOR 600
+
+        /* Offset (%) - default 40 */
+#define CVT_C_FACTOR 40
+
+        /* Blanking time scaling factor - default 128 */
+#define CVT_K_FACTOR 128
+
+        /* Scaling factor weighting - default 20 */
+#define CVT_J_FACTOR 20
+
+#define CVT_M_PRIME CVT_M_FACTOR * CVT_K_FACTOR / 256 /* NOTE(review): unparenthesised; relies on operator precedence at its single use in step 12 */
+#define CVT_C_PRIME (CVT_C_FACTOR - CVT_J_FACTOR) * CVT_K_FACTOR / 256 + \
+        CVT_J_FACTOR
+
+        /* 12. Find ideal blanking duty cycle from formula */
+        hblank_percentage = CVT_C_PRIME - CVT_M_PRIME * hperiod / 1000.0;
+
+        /* 13. Blanking time */
+        if (hblank_percentage < 20)
+            hblank_percentage = 20;
+
+        hblank = mode_info->hdisplay * hblank_percentage / (100.0 - hblank_percentage);
+        hblank -= hblank % (2 * CVT_H_GRANULARITY);     /* round down to a multiple of 2 * CVT_H_GRANULARITY */
+
+        /* 14. Find total number of pixels in a line. */
+        mode_info->htotal = mode_info->hdisplay + hblank;
+
+        /* Fill in HSync values */
+        mode_info->hsync_end = mode_info->hdisplay + hblank / 2;
+
+        hsync_w = (mode_info->htotal * CVT_HSYNC_PERCENTAGE) / 100;
+        hsync_w -= hsync_w % CVT_H_GRANULARITY;         /* round down to a multiple of CVT_H_GRANULARITY */
+        mode_info->hsync_start = mode_info->hsync_end - hsync_w;
+
+        /* Fill in vsync values */
+        mode_info->vsync_start = mode_info->vdisplay + CVT_MIN_V_PORCH_RND;
+        mode_info->vsync_end = mode_info->vsync_start + vsync;
+
+    }
+    else {                      /* reduced blanking */
+        /* Minimum vertical blanking interval time (µs) - default 460 */
+#define CVT_RB_MIN_VBLANK 460.0
+
+        /* Fixed number of clocks for horizontal sync */
+#define CVT_RB_H_SYNC 32.0
+
+        /* Fixed number of clocks for horizontal blanking */
+#define CVT_RB_H_BLANK 160.0
+
+        /* Fixed number of lines for vertical front porch - default 3 */
+#define CVT_RB_VFPORCH 3
+
+        int vblank_interval_lines;
+
+        /* 8. Estimate Horizontal period. */
+        hperiod = ((float) (1000000.0 / vfield_rate - CVT_RB_MIN_VBLANK)) /
+            (vdisplay_rnd + 2 * vmargin);
+
+        /* 9. Find number of lines in vertical blanking */
+        vblank_interval_lines = ((float) CVT_RB_MIN_VBLANK) / hperiod + 1;
+
+        /* 10. Check if vertical blanking is sufficient */
+        if (vblank_interval_lines < (CVT_RB_VFPORCH + vsync + CVT_MIN_V_BPORCH))
+            vblank_interval_lines = CVT_RB_VFPORCH + vsync + CVT_MIN_V_BPORCH;
+
+        /* 11. Find total number of lines in vertical field */
+        mode_info->vtotal = vdisplay_rnd + 2 * vmargin + interlace + vblank_interval_lines;
+
+        /* 12. Find total number of pixels in a line */
+        mode_info->htotal = mode_info->hdisplay + CVT_RB_H_BLANK;
+
+        /* Fill in HSync values */
+        mode_info->hsync_end = mode_info->hdisplay + CVT_RB_H_BLANK / 2;
+        mode_info->hsync_start = mode_info->hsync_end - CVT_RB_H_SYNC;
+
+        /* Fill in vsync values */
+        mode_info->vsync_start = mode_info->vdisplay + CVT_RB_VFPORCH;
+        mode_info->vsync_end = mode_info->vsync_start + vsync;
+    }
+
+    /* 15/13. Find pixel clock frequency (kHz for xf86) */
+    mode_info->dot_clock = mode_info->htotal * 1000.0 / hperiod;
+    mode_info->dot_clock -= mode_info->dot_clock % CVT_CLOCK_STEP;  /* round down to a multiple of CVT_CLOCK_STEP */
+
+    /* 16/14. Find actual Horizontal Frequency (kHz) */
+    mode_info->hsync = ((float) mode_info->dot_clock) / ((float) mode_info->htotal);
+
+    /* 17/15. Find actual Field rate */
+    mode_info->vrefresh = (1000.0 * ((float) mode_info->dot_clock)) /
+        ((float) (mode_info->htotal * mode_info->vtotal));
+
+    /* 18/16. Find actual vertical frame frequency */
+    /* ignore - just set the mode flag for interlaced */
+    if (interlaced)
+        mode_info->vtotal *= 2; /* vtotal held lines per field up to this point */
+
+    if (reduced)
+        mode_info->mode_flags |= LIBXCVT_MODE_FLAG_HSYNC_POSITIVE | LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE;
+    else
+        mode_info->mode_flags |= LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE | LIBXCVT_MODE_FLAG_VSYNC_POSITIVE;
+
+    if (interlaced)
+        mode_info->mode_flags |= LIBXCVT_MODE_FLAG_INTERLACE;
+
+    /* FWXGA hack adapted from hw/xfree86/modes/xf86EdidModes.c, because you can't say 1366 */
+    if (mode_info->hdisplay == 1360 && mode_info->vdisplay == 768) {
+        mode_info->hdisplay = 1366;
+        mode_info->hsync_start--;
+        mode_info->hsync_end--;
+    }
+
+    return mode_info;
+}
diff --git a/recipes/wip/x11/libxcvt/source/lib/meson.build b/recipes/wip/x11/libxcvt/source/lib/meson.build
new file mode 100644
index 0000000000..5d8d5150b6
--- /dev/null
+++ b/recipes/wip/x11/libxcvt/source/lib/meson.build
@@ -0,0 +1,7 @@
+libxcvt_sources = ['libxcvt.c']
+libxcvt = shared_library('xcvt',
+  libxcvt_sources,
+  include_directories : inc,
+  version: meson.project_version(),
+  darwin_versions : ['1.0.0', '1.0.0' ],
+  install : true)
diff --git a/recipes/wip/x11/libxcvt/source/man/cvt.man b/recipes/wip/x11/libxcvt/source/man/cvt.man
new file mode 100644
index 0000000000..009ab6540b
--- /dev/null
+++ b/recipes/wip/x11/libxcvt/source/man/cvt.man
@@ -0,0 +1,41 @@
+.TH CVT 1 @vendorversion@
+.SH NAME
+cvt - calculate VESA CVT mode lines
+.SH SYNOPSIS
+.B cvt
+.RB [ \-v | \-\-verbose ]
+.RB [ \-r | \-\-reduced ]
+.I h-resolution
+.I v-resolution
+.RB [ refresh ]
+.SH DESCRIPTION
+.I Cvt
+is a utility for calculating VESA Coordinated Video Timing modes. Given the
+desired horizontal and vertical resolutions, a modeline adhering to the CVT
+standard is printed. This modeline can be included in Xorg
+.B xorg.conf(@filemansuffix@)
+.
+
+.SH OPTIONS
+.TP 8
+.BR refresh
+Provide a vertical refresh rate in Hz. The CVT standard prefers either 50.0,
+60.0, 75.0 or 85.0Hz. The default is 60.0Hz.
+.TP 8
+.BR \-v | \-\-verbose
+Warn verbosely when a given mode does not completely correspond with CVT
+standards.
+.TP 8
+.BR \-r | \-\-reduced
+Create a mode with reduced blanking. This allows for higher frequency signals,
+with a lower or equal dotclock. Not for Cathode Ray Tube based displays though.
+
+.SH "SEE ALSO"
+xorg.conf(@filemansuffix@), gtf(@appmansuffix@)
+.SH AUTHOR
+Luc Verhaegen.
+.PP
+This program is based on the Coordinated Video Timing sample
+implementation written by Graham Loveridge. This file is publicly
+available at <http://www.vesa.org/Public/CVT/CVTd6r1.xls>. CVT is a
+VESA trademark.
diff --git a/recipes/wip/x11/libxcvt/source/man/meson.build b/recipes/wip/x11/libxcvt/source/man/meson.build
new file mode 100644
index 0000000000..5234b162c4
--- /dev/null
+++ b/recipes/wip/x11/libxcvt/source/man/meson.build
@@ -0,0 +1,12 @@
+man_conf = configuration_data()  # values substituted for the @...@ placeholders in cvt.man
+man_conf.set('appmansuffix', '1')
+man_conf.set('filemansuffix', '5')
+man_conf.set('vendorversion',
+             '"libxcvt @0@" "X Version 11"'.format(meson.project_version()))
+
+configure_file(
+  input: 'cvt.man',
+  output: 'cvt.1',
+  install_dir: join_paths(man, 'man1'),
+  configuration: man_conf
+)
diff --git a/recipes/wip/x11/libxcvt/source/meson.build b/recipes/wip/x11/libxcvt/source/meson.build
new file mode 100644
index 0000000000..b93104cf48
--- /dev/null
+++ b/recipes/wip/x11/libxcvt/source/meson.build
@@ -0,0 +1,28 @@
+project('libxcvt', 'c',
+  version: '0.1.3',
+  meson_version: '>= 0.40.0',
+  default_options: ['warning_level=1',
+                    'buildtype=debugoptimized'])
+
+libcvt_version = meson.project_version().split('.')  # not referenced again in this file
+
+cc = meson.get_compiler('c')
+mdep = cc.find_library('m', required : false)  # math library; configuration continues if it is not found
+
+prefix = get_option('prefix')
+inc = include_directories('include')
+man = join_paths(prefix, get_option('mandir'))  # used by man/meson.build to build its install_dir
+
+subdir('include')
+subdir('lib')  # defines libxcvt, which pkg_mod.generate() and libxcvt_dep below refer to
+subdir('cvt')
+subdir('man')
+
+pkg_mod = import('pkgconfig')
+pkg_mod.generate(libraries : libxcvt,
+                 version : meson.project_version(),
+                 name : 'libxcvt',
+                 description : 'A Library to generate VESA CVT standard timing modelines.')
+
+libxcvt_dep = declare_dependency(link_with: libxcvt,
+  include_directories: inc)