diff --git a/config/redbear-full.toml b/config/redbear-full.toml index 1743ec7a..42704b8e 100644 --- a/config/redbear-full.toml +++ b/config/redbear-full.toml @@ -24,10 +24,16 @@ shell = "/usr/bin/ion" shell = "/usr/bin/zsh" [packages] +# Runtime driver parameter control surface. +driver-params = {} + # Firmware loading redbear-firmware = {} firmware-loader = {} +# NUMA topology discovery (userspace daemon) +numad = {} + # GPU/graphics stack redox-drm = {} mesa = {} @@ -400,3 +406,4 @@ subclass = 0x00 command = ["redox-drm"] """ konsole = {} +kf6-pty = {} diff --git a/local/docs/DEVICE-INIT-COMPREHENSIVE-IMPROVEMENT-PLAN.md b/local/docs/DEVICE-INIT-COMPREHENSIVE-IMPROVEMENT-PLAN.md new file mode 100644 index 00000000..ea274e72 --- /dev/null +++ b/local/docs/DEVICE-INIT-COMPREHENSIVE-IMPROVEMENT-PLAN.md @@ -0,0 +1,735 @@ +# Red Bear OS Low-Level Device Initialization — Comprehensive Improvement Plan + +**Date:** 2026-04-30 +**Scope:** Complete reassessment of boot-time device initialization: daemon inventory, firmware loading, driver model, bus enumeration, controller support, hardware validation +**Reference:** Linux 7.0 kernel device init model (full source available for comparison) +**Status:** Assessment phase — this document is the execution plan + +## 1. Executive Summary + +Red Bear OS has crossed the fundamental bring-up threshold: the system boots to a login prompt on +both QEMU and bounded bare-metal hardware (AMD Ryzen), device daemons start in a defined order, +and major subsystems (ACPI, PCI, USB/xHCI, NVMe, network) have in-tree implementations. + +However, the device initialization stack is **not release-grade**. Key deficiencies vs Linux 7.0: + +| Gap | Severity | Impact | +|-----|----------|--------| +| No proper device driver model (bus/device/driver binding) | CRITICAL | No deferred probing, no async init, no hotplug | +| No uevent/hotplug infrastructure (udev-shim is static enumerator only) | CRITICAL | No device add/remove notification; `udev-shim` is misnamed — it does a single PCI scan, not real udev | +| No EHCI/OHCI/UHCI USB controllers | HIGH | USB keyboard not reliable on bare metal | +| initfs vs rootfs driver duality — drivers started in initfs may conflict with rootfs drivers | HIGH | No explicit handoff contract for devices initialized in initfs | +| No hardware validation for MSI-X, IOMMU, xHCI interrupts | HIGH | QEMU-proven only; real hardware behavior unknown | +| No suspend/resume or runtime power management | HIGH | No S3/S4 sleep, no device power gating | +| No CPU frequency scaling or thermal management | MEDIUM | Battery life, thermal throttling absent | +| No hardware RNG daemon, no SMBIOS/DMI runtime | MEDIUM | Missing entropy source, missing quirk data | +| No PCIe AER, no advanced error reporting | MEDIUM | Silent device failures | +| Firmware loading GPU-only (no Wi-Fi, audio, media) | MEDIUM | Blocks iwlwifi, Bluetooth, media acceleration | +| No device naming policy or persistent device names | MEDIUM | `/dev/` names unstable across boots | +| No kernel cmdline for device parameterization | LOW | No runtime device config without rebuild | +| ACPI startup still carries panic-grade `expect` paths | HIGH | Boot fragility on diverse hardware | +| `acpid` `_S5` shutdown not release-grade | HIGH | Unclean shutdown on some platforms | +| Wi-Fi transport asserts on MSI-X (no legacy IRQ fallback) | HIGH | Wi-Fi won't work on older platforms | +| No EHCI companion controller routing for USB keyboards | HIGH | USB keyboard may be unreachable on some bare 
metal | +| No io_uring or epoll for async I/O in device daemons | LOW | Throughput ceiling for NVMe | + +### Bottom Line + +**Red Bear OS boots, but device initialization is naive by Linux 7.0 standards.** The microkernel +scheme-based driver model is architecturally sound, but the implementation lacks the maturity, +error resilience, hardware coverage, and power management depth that Linux 7.0 has accumulated +over 30 years of driver development. + +This plan defines a structured path to close these gaps over 5 phases (26-40 weeks). + +## 2. Current State Assessment + +### 2.1 Boot Flow + +``` +UEFI firmware → Bootloader → Kernel (kstart→kmain) → +userspace_init → bootstrap (procmgr) → initfs init → +├── Phase 1 (initfs): logd, nulld, randd, zerod, rtcd, ramfs +├── Phase 1 (initfs): inputd, lived +├── Phase 1 (initfs): vesad, fbbootlogd, fbcond (graphics target) +├── Phase 1 (initfs): hwd, pcid-spawner-initfs, ps2d (drivers target) +├── Phase 1 (initfs): rootfs mount → switchroot +├── Phase 2 (rootfs): ipcd, ptyd, pcid-spawner (base target) +│ ├── pcid-spawner spawns drivers matching PCI IDs: +│ │ ├── Storage: ahcid, ided, nvmed, virtio-blkd, usbscsid +│ │ ├── Network: e1000d, rtl8168d, rtl8139d, ixgbed, virtio-netd +│ │ ├── Graphics: vesad, ihdgd, virtio-gpud +│ │ ├── Input: ps2d, usbhidd +│ │ ├── Audio: ihdad, ac97d, sb16d +│ │ └── USB: xhcid, usbhubd +│ ├── smolnetd → dhcpd (network target) +│ ├── firmware-loader, udev-shim, evdevd, wifictl +│ ├── dbus-daemon → redbear-sessiond, seatd +│ └── console/getty → login prompt +``` + +### 2.2 Daemon Inventory — Existence and Quality + +#### Core Initfs Daemons (20 services) + +| Daemon | Quality | Notes | +|--------|---------|-------| +| `logd` | ✅ Hardened | Zero unwrap/expect; file descriptors, setrens, process loop | +| `nulld` | ✅ Hardened | Zero unwrap/expect | +| `randd` | ✅ Hardened | CPUID chain hardened; 8 test-only unwraps | +| `zerod` | ✅ Hardened | Args default + graceful exit | +| `rtcd` | ✅ Present | x86 RTC driver; minimal attack surface | +| `ramfs@` | ✅ Present | Template service for RAM filesystems | +| `inputd` | ✅ Hardened | 14 panic sites converted; partial vt events, buffer sizes | +| `lived` | ✅ Present | Live disk daemon | +| `vesad` | ✅ Hardened | 20 fixes; FRAMEBUFFER env, EventQueue, event loop, scheme | +| `fbbootlogd` | ✅ Hardened | 14 fixes; VT handle, graphics handle, dirty_fb | +| `fbcond` | ✅ Hardened | 14 fixes; VT parse, event loop, writes, scheme, display | +| `hwd` | ✅ Present | ACPI/DeviceTree boot handler | +| `pcid-spawner-initfs` | ✅ Hardened | initfs variant; oneshot_async | +| `ps2d` | ✅ Hardened | Controller init drains stale output; QEMU proof | +| `bcm2835-sdhcid` | ✅ Present | ARM-only (Raspberry Pi) | + +#### Core Rootfs Daemons (9 base services) + +| Daemon | Quality | Notes | +|--------|---------|-------| +| `ipcd` | ✅ Present | IPC daemon | +| `ptyd` | ✅ Present | Pseudo-terminal daemon | +| `pcid-spawner` | ✅ Hardened | Changed to oneshot_async (was blocking init); logs device info | +| `sudo` | ✅ Present | Privilege daemon | +| `smolnetd`/`netstack` | ✅ Present | TCP/IP stack | +| `dhcpd` | ✅ Present | DHCP client | +| `audiod` | ✅ Present | Audio multiplexer | + +#### PCI-Matched Device Drivers (pcid-spawner, 25+ drivers) + +| Category | Drivers | Quality | +|----------|---------|---------| +| Storage | ahcid, ided, nvmed, virtio-blkd, usbscsid | ✅ All hardened (Wave 4 complete) | +| Network | e1000d, rtl8168d, rtl8139d, ixgbed, virtio-netd | ✅ All hardened | +| Graphics | vesad, ihdgd, 
virtio-gpud | ✅ All hardened |
+| Input | ps2d, usbhidd | ✅ All hardened |
+| Audio | ihdad, ac97d, sb16d | ✅ All hardened |
+| USB | xhcid, usbhubd, usbctl, ucsid | ✅ xhcid has 88 Red Bear patches |
+| GPIO/I2C | gpiod, i2cd, intel-gpiod, amd-mp2-i2cd, dw-acpi-i2cd, i2c-gpio-expanderd, i2c-hidd, intel-thc-hidd, intel-lpss-i2cd | ✅ Present |
+| System | pcid, pcid-spawner, acpid | ✅ Core infra; pcid hardened Wave 1-2 |
+| VirtualBox | vboxd | ✅ x86 only |
+
+#### Custom Red Bear Daemons
+
+| Daemon | Quality | Notes |
+|--------|---------|-------|
+| `firmware-loader` | ✅ Well-tested | 18 unit tests; scheme:firmware with read/mmap; no signing |
+| `redox-drm` | 🟡 Bounded compile | AMD+Intel+VirtIO display; 68 tests; no HW validation |
+| `amdgpu` | 🟡 Bounded compile | Imported Linux DC/TTM/core; partial display glue |
+| `iommu` | 🟡 QEMU-proven | AMD-Vi detection + first-use proof; no HW validation |
+| `udev-shim` | ✅ Present | Scheme:udev with device enumeration |
+| `evdevd` | ✅ Present | Linux-compatible evdev interface |
+| `redbear-sessiond` | ✅ Present | D-Bus login1 session broker |
+| `redbear-wifictl` | 🟡 Host-tested | Wi-Fi control daemon; no real hardware |
+| `redbear-iwlwifi` | 🟡 Host-tested | Intel transport; ~2450 lines C + ~1550 lines Rust; 119 tests |
+| `redbear-btusb` | 🔴 Experimental | BLE-first; USB-attached only; QEMU validation in progress |
+| `redbear-authd` | ✅ Present | Local-user authentication |
+| `redbear-greeter` | 🟡 Partial | Greeter orchestrator; Qt Wayland integration broken |
+| `redbear-netctl` | ✅ Present | Network profile management |
+| `redbear-hwutils` | ✅ Present | lspci, lsusb, phase checkers |
+
+### 2.3 Firmware Loading
+
+**What exists:**
+- `scheme:firmware` daemon (`firmware-loader`) indexes blobs from `/lib/firmware/`
+- `linux-kpi` provides `request_firmware()` via Rust FFI
+- AMD GPU blobs (675 .bin files) in `local/firmware/amdgpu/` (gitignored, fetched from linux-firmware)
+- Intel DMC display blobs fetchable via `fetch-firmware.sh --vendor intel --subset dmc`
+- Two fetch mechanisms: standalone script (selective) + build-time meta-package (full linux-firmware)
+- `PCI_QUIRK_NEED_FIRMWARE` flag defined (bit 11), but never checked by any driver
+
+**What is MISSING vs Linux 7.0 `firmware_class`:**
+- No firmware signing/verification (no `module_sig_check` equivalent)
+- No `request_firmware_nowait` with uevent dispatch to userspace helper (Linux uses `/sys/$DEVPATH/loading` + `/sys/$DEVPATH/data` + uevent to notify udev)
+- No persistent firmware cache between boots (in-memory only; Linux caches during suspend for resume-fastpath)
+- No fallback firmware variant search (if dmcub_dcn31.bin missing, try dmcub_dcn30.bin; Linux has per-driver firmware search paths)
+- No `/sys/firmware/` interface (Linux exposes firmware loading status via sysfs)
+- No firmware preloading at driver bind time
+- No timeout for synchronous `request_firmware` (blocks forever; Linux times out after ~60s with uevent fallback)
+- No platform firmware fallback (Linux can search UEFI firmware volumes via `firmware_request_platform()`)
+- No Wi-Fi firmware blobs (iwlwifi, ath10k, etc.)
+- No Bluetooth firmware blobs +- No audio/media codec firmware +- Firmware lookup limited to 3 hardcoded paths (Linux searches: `/lib/firmware/`, `/lib/firmware/updates/`, `/lib/firmware/$KVER/`, `/usr/lib/firmware/`, `/usr/share/firmware/`, plus custom path via kernel param) + +### 2.4 Hardware Validation Status + +| Subsystem | QEMU | Bare Metal | Notes | +|-----------|------|------------|-------| +| ACPI boot | ✅ | ✅ (AMD) | Boot-baseline; `_S5` shutdown not release-grade | +| x2APIC/SMP | ✅ | ✅ | Multi-core works | +| PCI enumeration | ✅ | ✅ | pcid enumerates devices | +| MSI-X | ✅ (virtio-net) | ❌ | No hardware proof | +| IOMMU/AMD-Vi | ✅ (first-use) | ❌ | Detection works; no HW validation | +| xHCI interrupt | ✅ | ❌ | Interrupt mode proven; no HW | +| USB storage | ✅ (readback) | ❌ | QEMU mass-storage proof | +| NVMe | ✅ | ❌ | Builds; no HW | +| AHCI | ✅ | ❌ | Builds; no HW | +| Network (e1000/virtio) | ✅ | ❌ | QEMU only | +| PS/2 keyboard | ✅ | ✅ | QEMU + AMD bare metal | +| USB keyboard | ✅ (QEMU HID) | ⚠️ | Not reliable on bare metal | +| Wi-Fi | ❌ | ❌ | Host-tested transport only | +| Bluetooth | ❌ | ❌ | Experimental BLE; QEMU in progress | + +### 2.5 Comparison with Linux 7.0 Device Init Model + +#### 2.5.1 Linux Initcall Ordering (Reference) + +Linux uses a 10-level initcall system for boot-phase ordering: + +| Level | Macro | Typical Count | Example Uses | +|-------|-------|---------------|--------------| +| 0 | `pure_initcall` | ~few | Pure infrastructure | +| early | `early_initcall` | ~446 | mm init, early console, DT scan | +| 1 | `core_initcall` | ~614 | Workqueues, RCU, memory allocators | +| 2 | `postcore_initcall` | ~150 | Clocksource, scheduler, IRQ core | +| 3 | `arch_initcall` | ~751 | PCI bus init, ACPI table parsing, CPU bringup | +| 4 | `subsys_initcall` | ~573 | PCI enumerate, USB core, networking core, block | +| 5 | `fs_initcall` | ~1372 | Filesystem registration | +| 6 | `device_initcall` | ~1211 | Most drivers; `module_init()` maps here | +| 7 | `late_initcall` | ~440 | Late init, debug, tracing | + +Red Bear OS has **no equivalent ordering mechanism** — the TOML-based init uses `requires_weak` +for loose ordering but has no topological sort depth, no `Before`/`After` fields, no explicit +init phases beyond the coarse initfs/rootfs split. 
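+
+To make the gap concrete, the sketch below shows what an initcall-style ordering surface
+could look like if the init TOML grew explicit phases and ordering edges. The `phase`,
+`after`, and `before` keys are hypothetical (today only `requires_weak` exists), and the
+service name is illustrative:
+
+```toml
+# Hypothetical extension of the service TOML; only requires_weak exists today.
+[service.nvmed]
+phase = "device"           # coarse init phase, analogous to device_initcall
+after = ["pcid"]           # hard topological edge: start only after pcid
+before = ["rootfs-mount"]  # must complete before the rootfs mount step
+requires_weak = ["logd"]   # the existing loose-ordering hint, unchanged
+```
+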
+ +#### 2.5.2 Feature Comparison Table + +| Feature | Linux 7.0 | Red Bear OS | Gap | +|---------|-----------|-------------|-----| +| **Driver model** | `bus_type` → `device_driver` → `probe()` binding with match tables | `pcid-spawner` spawns drivers by PCI class/vendor/device | 🟡 Partial — single-shot spawn, no rebinding | +| **Deferred probing** | `driver_deferred_probe` — retries when dependency arrives; `-EPROBE_DEFER` triggers retry on any successful probe | None | 🔴 Missing — must be present at boot | +| **Async probing** | `async_probe` — parallel driver init via kthreadd workers | Sequential spawn only | 🟡 Partial — oneshot_async for launch but not true async init | +| **Hotplug** | uevent netlink → udev → driver bind/unbind; `/sbin/hotplug` path | `udev-shim` is a **static PCI enumerator** — one scan at boot, no event callbacks, no device removal handling | 🔴 Missing — no hotplug infrastructure at all | +| **Firmware loading** | `firmware_class` with `request_firmware`, user helper, caching | `scheme:firmware` + `linux-kpi` request_firmware | 🟡 Partial — no uevent/helper/caching | +| **USB controllers** | xHCI, EHCI, OHCI, UHCI — all supported | xHCI only | 🔴 Missing — EHCI/OHCI/UHCI absent | +| **USB device classes** | HID, storage, audio, video, CDC, vendor, etc. | HID, hub, storage (BOT), CSI (UCSI) | 🟡 Partial — many classes missing | +| **Power management** | Suspend/resume, runtime PM, CPU freq scaling, thermal | `_S5` shutdown only | 🔴 Missing — no S3/S4/PM | +| **Interrupt handling** | Full APIC/x2APIC, MSI/MSI-X, affinity, NMI, MCE | APIC/x2APIC; MSI-X via quirks | 🟡 Partial — no affinity, no NMI watchdog | +| **IOMMU** | AMD-Vi, Intel VT-d with DMA remapping + IR | AMD-Vi detection + first-use proof | 🟡 Partial — no VT-d, no hardware | +| **ACPI namespace** | Full namespace: devices, thermal, battery, processor, etc. | Boot-baseline: MADT, FADT, `_S5`, bounded power | 🟡 Partial — many ACPI objects missing | +| **PCIe features** | AER, ACS, ATS, PRI, PASID, SR-IOV | Basic PCI config space only | 🔴 Missing — no advanced PCIe | +| **Device naming** | Predictable network/storage names (systemd udev) | None | 🟡 Partial — no naming policy | +| **Hardware RNG** | `hw_random` framework, multiple drivers | None | 🔴 Missing | +| **CPU frequency** | `cpufreq` governors | None | 🔴 Missing | +| **Thermal management** | `thermal` framework + drivers | None | 🔴 Missing | +| **SMBIOS/DMI** | Full DMI table exposure via sysfs | Quirks system has DMI data | 🟡 Partial — not runtime-exposed | +| **Kernel cmdline** | Device parameters via boot cmdline | None | 🔴 Missing | + +## 3. Implementation Phases + +### Phase 1 — Driver Model Maturation (Weeks 1-8) + +**Goal:** Establish a proper device driver model with binding semantics, deferred probing, +and error resilience — bringing the driver infrastructure to Linux 7.0 par without rewriting +existing drivers. + +#### 1.1 Device-Driver Binding Model (Week 1-3) + +Create a `redox-driver-core` library providing Linux-style bus/device/driver abstractions: + +``` +Device → Driver matching: + pcid: class=0x01, subclass=0x08 → nvmed + pcid: vendor=0x8086, device=0x10D3 → e1000d + +Driver probe() returns: + Ok(()) → device bound, driver active + Err(ENODEV) → device not supported by this driver + Err(EAGAIN) → dependency not available, DEFER probe + Err(...) 
→ fatal error, device unusable +``` + +**Deliverables:** +- `redox-driver-core` crate with `Bus`, `Device`, `Driver` traits +- `pcid` exposes devices via new scheme: `scheme:pci/devices/{id}/bind` +- `pcid-spawner` replaced by `driver-manager` daemon that: + - Reads driver match tables from `/lib/drivers.d/*.toml` + - Probes drivers in priority order + - Supports deferred probing (EAGAIN → retry when dependency appears) + - Supports driver unbind/rebind +- All existing `pcid.d/*.toml` match files migrated to new format +- Backward compatible: existing pcid-spawner behavior preserved as fallback + +#### 1.2 Async Device Probing (Week 4-5) + +**Deliverables:** +- `driver-manager` probes independent device trees in parallel (using Rust async or threads) +- Device init order defined by dependency DAG, not sequential spawn +- Timing observability: log probe duration per driver +- `CONFIG_PARALLEL_PROBE` equivalent: max concurrent probes tunable via config TOML + +#### 1.3 Driver Parameter System (Week 6-7) + +**Deliverables:** +- Kernel cmdline parsing in bootloader (e.g., `redbear.nvme.irq_mode=msi`) +- `/scheme/sys/driver/{name}/parameters` read/write +- Driver authors declare parameters via derive macro +- `lspci -v` shows per-device parameters + +#### 1.4 Hotplug Infrastructure (Week 7-8) + +**Deliverables:** +- PCIe hotplug: `pcid` detects surprise removal/addition, emits uevent +- USB hotplug: `xhcid` emits uevent on device attach/detach +- `udev-shim` enhanced to receive uevents and trigger driver binding +- `driver-manager` handles hot-add (probe driver) and hot-remove (unbind driver) +- Initial scope: PCIe hotplug and USB hotplug only; Thunderbolt deferred + +**Phase 1 Exit Criteria:** +- New driver binding model functional for 3+ existing drivers (nvmed, e1000d, xhcid) +- Deferred probing works: driver returning EAGAIN retries when dependency scheme appears +- Async probing measurable: 2+ independent PCI devices probe concurrently +- Hotplug works: USB device attach/detach triggers udev-shim + driver bind/unbind in QEMU +- All 25+ existing drivers still compile and function (backward compatibility) + +### Phase 2 — Controller Coverage & Hardware Validation (Weeks 5-14) + +**Goal:** Fill the critical controller gaps (USB EHCI/OHCI/UHCI) and validate the +existing controller stack on real hardware — especially MSI-X, IOMMU, and xHCI. + +#### 2.1 USB Controller Family Completion (Week 5-9) + +This is the **highest-impact controller gap** because it directly blocks reliable +USB keyboard input on bare metal where the keyboard may be routed through companion +controllers rather than xHCI. 
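+
+The three missing controller families are distinguishable purely by PCI programming
+interface, so the spawner can match them the same way existing drivers are matched. A
+sketch using the class/subclass/interface match style of the existing pcid driver configs
+(the exact keys, table name, and daemon names are assumptions until the drivers exist):
+
+```toml
+# Hypothetical match entries. PCI class 0x0C / subclass 0x03 is "USB controller";
+# the programming interface selects the family:
+#   0x00 = UHCI, 0x10 = OHCI, 0x20 = EHCI, 0x30 = xHCI
+[[drivers]]
+name = "EHCI USB controller"
+class = 0x0C
+subclass = 0x03
+interface = 0x20
+command = ["ehcid"]
+
+[[drivers]]
+name = "OHCI USB controller"
+class = 0x0C
+subclass = 0x03
+interface = 0x10
+command = ["ohcid"]
+
+[[drivers]]
+name = "UHCI USB controller"
+class = 0x0C
+subclass = 0x03
+interface = 0x00
+command = ["uhcid"]
+```
+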
+ +**Deliverables:** +- `ehcid` daemon — EHCI (USB 2.0) host controller driver +- `ohcid` daemon — OHCI (USB 1.1) host controller driver for non-Intel chipsets +- `uhcid` daemon — UHCI (USB 1.1) host controller driver for Intel chipsets +- USB companion controller routing: when xHCI owns the ports, companion controllers + hand off low/full-speed devices to xHCI transparently +- `usb-manager` daemon orchestrates multi-controller topology: + - Single `scheme:usb` root exposing all buses + - Device path stability across controller types + - Port routing table for companion controller ownership handoff +- USB 3.1/3.2 SuperSpeedPlus support in xhcid (10 Gbps, 20 Gbps) +- USB-C PD/alt-mode awareness in `ucsid` + +**Implementation approach:** +- EHCI: Reference Linux `drivers/usb/host/ehci-hcd.c` (~6000 lines) and FreeBSD `sys/dev/usb/controller/ehci.c` +- OHCI: Reference Linux `drivers/usb/host/ohci-hcd.c` (~3000 lines) +- UHCI: Reference Linux `drivers/usb/host/uhci-hcd.c` (~2500 lines) +- All three controllers use the same `scheme:usb` interface — class daemons (usbhubd, usbhidd, usbscsid) work unchanged + +#### 2.2 xHCI Device-Level Hardening (Week 8-10) + +Per the existing `XHCID-DEVICE-IMPROVEMENT-PLAN.md`: + +**Deliverables:** +- Atomic device attach publication (prevent half-attached devices) +- Bounded device detach and purge +- Configure rollback on failure +- Real PM sequencing (U0/U1/U2/U3 transitions) +- Enumerator cleanup and timing hardening +- Growable event ring under sustained activity + +#### 2.3 MSI-X Hardware Validation (Week 8-11) + +Per the existing `IRQ-AND-LOWLEVEL-CONTROLLERS-ENHANCEMENT-PLAN.md` Priority 1: + +**Deliverables:** +- AMD GPU MSI-X validation: prove MSI-X vectors fire correctly on real AMD hardware +- Intel GPU MSI-X validation: prove MSI-X on Intel hardware +- NVMe MSI-X validation: prove per-queue interrupt vectors +- xHCI MSI-X validation: prove interrupt-driven event ring on real hardware (not just QEMU) +- Verified MSI-X → MSI → legacy IRQ fallback on all tested hardware +- Logged CPU/vector affinity behavior +- At minimum one AMD and one Intel bare-metal test report per device class + +#### 2.4 IOMMU Hardware Bring-Up (Week 9-14) + +Per the existing `IRQ-AND-LOWLEVEL-CONTROLLERS-ENHANCEMENT-PLAN.md` Priority 2: + +**Deliverables:** +- Validated AMD-Vi initialization on real AMD hardware +- Device table / command buffer / event log validation +- Interrupt remapping validation +- Intel VT-d initial detection and register mapping (not full bring-up) +- IOMMU fault-path validation: inject fault, verify event log capture +- DMA remapping proof: verify device DMA is translated through IOMMU page tables +- Negative-result documentation if hardware still fails + +#### 2.5 ACPI Wave 1-2 Completion (Week 10-12) + +Per the existing `ACPI-IMPROVEMENT-PLAN.md` Waves 1-2: + +**Deliverables:** +- Finish replacing panic-grade `expect` paths in `acpid` startup +- Define and document AML bootstrap contract (explicit RSDP_ADDR producer) +- Table-specific reject/warn/degrade/fail rules implemented +- Deterministic `_S5` derivation (not dependent on PCI timing) +- Explicit shutdown/reboot result semantics +- Bounded shutdown proof on real AMD and Intel hardware +- Sleep-state scope explicit: S5 only; S3/S4 explicitly deferred + +**Phase 2 Exit Criteria:** +- At least one EHCI or OHCI/UHCI driver functional in QEMU +- USB keyboard reliably reachable on bare metal AMD and Intel (via xHCI, EHCI, or companion routing) +- MSI-X validated on at least one real AMD GPU and one 
real Intel GPU +- IOMMU AMD-Vi validated on at least one real AMD machine +- ACPI `_S5` shutdown works on at least one real AMD and one real Intel machine +- ACPI startup contains zero panic-grade paths reachable from firmware input + +### Phase 3 — Power Management & Platform Services (Weeks 12-20) + +**Goal:** Add suspend/resume, CPU frequency scaling, thermal management, and hardware +RNG — bringing platform services to Linux 7.0 par for basic functionality. + +#### 3.1 ACPI Power Management (Week 12-14) + +Per the existing `ACPI-IMPROVEMENT-PLAN.md` Waves 3-4: + +**Deliverables:** +- Honest `/scheme/acpi/power` surface: exposes only behavior with runtime evidence +- Consumer-visible distinction between unsupported, unavailable, and populated power state +- Reduced surface: remove misleading empty-success defaults +- AML physmem/EC failure propagation: no correctness-critical fabricated values +- EC error typing and documented widened-access behavior +- Documented AML mutex timeout behavior + +#### 3.2 Suspend/Resume (S3 Sleep) — Initial Implementation (Week 13-16) + +**Deliverables:** +- Kernel: save/restore CPU context (CR0-CR4, MSRs, IDT/GDT, FPU/SSE/AVX state) +- Kernel: ACPI S3 (suspend-to-RAM) entry via `_S3` AML method +- Kernel: wake vector registration and resume path +- `acpid`: expose `/scheme/acpi/sleep` with `S3` and `S5` states +- Device contract: `suspend()` callback on each scheme daemon + - Storage: flush caches, park heads (if spinning) + - Network: bring link down, save MAC filter state + - USB: save controller/port state + - Graphics: save mode, blank display +- `driver-manager`: suspend devices in dependency order, resume in reverse +- Initial scope: S3 only on test hardware; S4 (hibernate) explicitly deferred + +#### 3.3 CPU Frequency Scaling (Week 14-16) + +**Deliverables:** +- `cpufreqd` daemon reading ACPI `_PSS` / `_PPC` objects +- Intel: P-state MSR writes (IA32_PERF_CTL) +- AMD: P-state MSR writes + CPPC awareness +- Governors: `performance` (max freq), `powersave` (min freq), `ondemand` (load-based) +- `/scheme/cpufreq` for reading/setting governor and frequency +- `redbear-info` shows current frequency and governor + +#### 3.4 Thermal Management (Week 15-17) + +**Deliverables:** +- `thermald` daemon reading ACPI thermal zone objects (`_TMP`, `_PSV`, `_TC1`, `_TC2`) +- Active cooling: fan control via ACPI `_SCP` +- Passive cooling: CPU throttling via cpufreqd integration +- Critical shutdown: if temperature exceeds `_CRT`, initiate clean shutdown +- `/scheme/thermal` for reading zone temperatures and trip points +- `redbear-info` shows thermal zone status + +#### 3.5 Hardware RNG (Week 16-17) + +**Deliverables:** +- `hwrngd` daemon reading hardware RNG sources: + - x86 RDRAND/RDSEED instructions + - TPM 2.0 random number generator (if present) + - VirtIO entropy device +- `scheme:hwrng` feeding into `randd` entropy pool +- `/scheme/hwrng` exposes raw entropy and health status +- Linux 7.0 `hw_random` framework ported conceptually (not literally) + +#### 3.6 PCIe Advanced Error Reporting (Week 17-18) + +**Deliverables:** +- `pcid` exposes AER capability registers via `/scheme/pci/{dev}/aer` +- AER error detection: correctable and uncorrectable error status registers +- Error logging: decode error source (data link, transaction, poison TLP, etc.) 
+- `aer-inject` utility for testing error paths +- Initial scope: error detection and logging only; error recovery (device reset path) deferred + +#### 3.7 SMBIOS/DMI Runtime Exposure (Week 18-20) + +**Deliverables:** +- `dmidecode`-equivalent utility using `acpid` DMI scheme +- `/scheme/dmi` exposes SMBIOS entry point and table data +- `lspci -v` shows DMI-based quirk annotations +- DMI data feeding into `redbear-info` for platform identification +- Integration with existing quirks system: DMI match rules validated at runtime + +**Phase 3 Exit Criteria:** +- S3 suspend/resume works on at least one real machine (AMD or Intel) +- CPU frequency scaling observable via `redbear-info` +- Thermal zone temperature readable and critical shutdown testable +- Hardware RNG feeding entropy pool +- PCIe AER errors logged on capable hardware +- DMI data accessible via scheme and tools +- All new schemes documented with test procedures + +### Phase 4 — Firmware Infrastructure & Wi-Fi Validation (Weeks 16-24) + +**Goal:** Close firmware loading gaps, complete Wi-Fi hardware validation with real +firmware, and establish firmware management as a first-class platform service. + +#### 4.1 Firmware Loading Gap Closure (Week 16-18) + +**Deliverables:** +- `request_firmware_nowait` with proper uevent dispatch: + - Async request → uevent → `udev-shim` listens → `firmware-loader` serves blob + - Timeout: if firmware not available within configurable timeout, fail gracefully +- Firmware fallback variant search: + - If `dmcub_dcn31.bin` not found, try `dmcub_dcn30.bin`, `dmcub_dcn20.bin` + - Per-driver fallback chain defined in `/etc/firmware-fallbacks.d/*.toml` +- Persistent firmware cache (`/var/lib/firmware/`): + - Loaded blobs cached on first use; survive daemon restart + - Cache invalidation on firmware version change +- `PCI_QUIRK_NEED_FIRMWARE` enforcement: + - Drivers actually check the flag via `pci_has_quirk()` + - When flag is set: require firmware at probe time, fail probe if absent + - When flag is absent: firmware is optional, warn if missing but continue +- Fetch Intel Wi-Fi firmware blobs: `fetch-firmware.sh --vendor intel --subset wifi` +- Fetch Bluetooth firmware blobs where applicable +- Firmware manifest: `/lib/firmware/MANIFEST.txt` lists all blobs, versions, sources + +#### 4.2 Wi-Fi Hardware Validation (Week 16-22) + +Per the existing `WIFI-IMPLEMENTATION-PLAN.md`: + +**Deliverables:** +- Real Intel Wi-Fi device (e.g., AX200/AX201/AX210) validated end-to-end +- `redbear-iwlwifi` transport: + - Firmware loaded via `request_firmware()` → `scheme:firmware` + - DMA ring operation validated (TX reclaim, RX restock, command dispatch) + - Interrupt handling validated (MSI-X or MSI path) + - Association/authentication cycle completed with real AP +- `redbear-wifictl` control plane: + - Scan → connect → DHCP → disconnect cycle validated + - WPA2-PSK and open network profiles functional + - Profile persistence and boot-time application +- `redbear-netctl` Wi-Fi profiles: + - SSID/Security/Key parsing validated + - Bounded Wi-Fi lifecycle (prepare → init-transport → activate-nic → connect → disconnect) +- Wi-Fi runtime diagnostics: + - `redbear-phase5-wifi-check` reports link quality, signal strength, connected AP + - `redbear-info --verbose` shows Wi-Fi adapter status +- At minimum one real Intel Wi-Fi chipset validated +- Legacy IRQ fallback for platforms where MSI-X is unavailable (via quirks) + +#### 4.3 Wi-Fi Desktop API (Week 20-24) + +**Deliverables:** +- D-Bus Wi-Fi API on system bus: 
`org.freedesktop.NetworkManager` subset + - `GetDevices`, `GetAccessPoints`, `ActivateConnection`, `DeactivateConnection` + - Signal: `AccessPointAdded`, `AccessPointRemoved`, `StateChanged` +- `redbear-wifictl` exposes D-Bus interface for desktop consumption +- `redbear-netctl` GUI client for scanning and connecting (Qt6-based, optional) +- Desktop status bar Wi-Fi indicator (future KDE plasma-nm integration) + +**Phase 4 Exit Criteria:** +- `request_firmware_nowait` with uevent dispatch functional in QEMU +- PCI_QUIRK_NEED_FIRMWARE enforced in at least one driver (amdgpu or iwlwifi) +- Intel Wi-Fi chipset validated end-to-end with real AP +- Wi-Fi scan → connect → DHCP → internet access completed on real hardware +- Wi-Fi D-Bus API functional for at least get_devices and get_accesspoints +- Firmware manifest tracks all loaded blobs with versions + +### Phase 5 — Bluetooth, Device Policy, Polish (Weeks 20-30) + +**Goal:** Bring Bluetooth to validated experimental status, establish device naming policy, +and polish remaining gaps. + +#### 5.1 Bluetooth Hardware Validation (Week 20-24) + +Per the existing `BLUETOOTH-IMPLEMENTATION-PLAN.md`: + +**Deliverables:** +- `redbear-btusb` transport validated with real USB Bluetooth adapter +- `redbear-btctl` HCI host validated: + - Controller init sequence (reset, read local features, set event mask) + - Device discovery (LE scan → advertising report → connect) + - GATT service discovery + - Basic data exchange (battery service, device info) +- BLE peripheral connect/disconnect cycle validated +- Bluetooth classic (BR/EDR) detection and basic inquiry (connect deferred) +- `redbear-bluetooth-battery-check` works on real hardware +- At minimum one real USB Bluetooth adapter validated + +#### 5.2 Device Naming Policy (Week 22-24) + +**Deliverables:** +- Predictable network interface names: + - `enp0s1` instead of `eth0` (PCIe bus/device/function based) + - `/etc/systemd/network/` equivalent rules in `/etc/udev/rules.d/` +- Predictable storage device names: + - NVMe: `nvme0n1` instead of raw scheme path + - AHCI: `sd{a,b,c}` assigned by port order + - USB storage: `sdX` with stable enumeration +- `/dev/disk/by-id/`, `/dev/disk/by-path/`, `/dev/disk/by-uuid/` symlinks +- `udev-shim` enhanced with rule matching (vendor, model, serial, path patterns) + +#### 5.3 Device Init Observability (Week 23-25) + +**Deliverables:** +- Boot-time device init timeline: log each device probe start/end with duration +- `redbear-info --boot` shows device init timeline post-boot +- Per-device init status: `redbear-info --device pci/00:02.0` +- Kernel cmdline `redbear.init_verbose` enables verbose device init logging +- Boot-time warning summary: all drivers that probed with warnings or deferrals +- Device init health dashboard: `redbear-info --health` shows init status of all subsystems + +#### 5.4 Remaining Gaps (Week 24-30) + +**Deliverables:** +- `nvmed` hardware validation: prove NVMe I/O on real hardware +- `ahcid` hardware validation: prove SATA I/O on real hardware +- `ihdad` hardware validation: prove audio output on real hardware +- USB device class coverage expanded: + - USB CDC ACM (serial): `usbcdcd` daemon + - USB CDC ECM/NCM (ethernet): `usbnetd` daemon (or integrate into existing net drivers) + - USB Audio Class 1/2: `usbaudiod` daemon +- GPU hardware acceleration readiness: + - Mesa radeonsi backend proof-of-concept (single draw call) + - KMS atomic modesetting proof on real hardware (not just QEMU) +- `redbear-btusb` autospawn via USB class matching +- 
`kstop` shutdown event: gracefully stop all device daemons before power-off + +**Phase 5 Exit Criteria:** +- Bluetooth BLE discovery and basic data exchange works on real hardware +- Network interfaces use predictable names on QEMU and bare metal +- Device init timeline observable via `redbear-info --boot` +- NVMe I/O validated on at least one real NVMe drive +- Real audio output validated on at least one HDA codec +- At least one USB device class beyond HID/storage validated (audio, serial, or ethernet) +- All 25+ existing drivers maintain backward compatibility + +## 4. Dependency Graph + +``` +Phase 1 (Driver Model) ─────────────────────────────┐ + ├── 1.1 Binding Model │ + ├── 1.2 Async Probing (after 1.1) │ + ├── 1.3 Driver Parameters (after 1.1) │ + └── 1.4 Hotplug (after 1.1) │ + │ +Phase 2 (Controllers) ───────────────────────────────┤ + ├── 2.1 USB EHCI/OHCI/UHCI (parallel with 1.2) │ + ├── 2.2 xHCI Hardening (parallel with 1.2) │ + ├── 2.3 MSI-X HW Validation (after 1.1) │ + ├── 2.4 IOMMU HW Bring-Up (parallel with 2.3) │ + └── 2.5 ACPI Wave 1-2 (parallel with 2.3) │ + │ +Phase 3 (Power Mgmt) ────────────────────────────────┤ + ├── 3.1 ACPI Wave 3-4 (after 2.5) │ + ├── 3.2 Suspend/Resume (after 3.1) │ + ├── 3.3 CPU Freq Scaling (parallel with 3.2) │ + ├── 3.4 Thermal Mgmt (after 3.1, parallel 3.3) │ + ├── 3.5 Hardware RNG (parallel with 3.3) │ + ├── 3.6 PCIe AER (after 2.3) │ + └── 3.7 SMBIOS/DMI (parallel with 3.6) │ + │ +Phase 4 (Firmware + Wi-Fi) ──────────────────────────┤ + ├── 4.1 Firmware Gaps (after 1.1) │ + ├── 4.2 Wi-Fi HW (after 4.1, parallel with 2.3) │ + └── 4.3 Wi-Fi Desktop API (after 4.2) │ + │ +Phase 5 (Bluetooth + Polish) ────────────────────────┤ + ├── 5.1 BT HW Validation (parallel with 4.2) │ + ├── 5.2 Device Naming (after 1.1) │ + ├── 5.3 Init Observability (after 1.2) │ + └── 5.4 Remaining Gaps (after 3.2, 4.2, 5.1) │ +``` + +## 5. Resource Estimates + +| Phase | Duration | Engineers | Key Risk | +|-------|----------|-----------|----------| +| Phase 1 | 8 weeks | 2 | Over-engineering the driver model; must stay backward compatible | +| Phase 2 | 6-9 weeks | 3 (parallelizable) | Real hardware availability; USB controller complexity | +| Phase 3 | 8 weeks | 2-3 | ACPI firmware quality varies wildly on real hardware | +| Phase 4 | 8 weeks | 2 | Wi-Fi hardware procurement; firmware licensing | +| Phase 5 | 10 weeks | 2 | Long tail of device class drivers | + +**Total:** 26-40 weeks (~6-10 months) with 2-3 engineers, depending on parallelism and +hardware availability. + +## 6. Risk Register + +| Risk | Probability | Impact | Mitigation | +|------|-------------|--------|------------| +| No access to AMD GPU with MSI-X | Medium | High | Partner with community; use Intel GPU as alternative | +| No access to AMD machine with IOMMU | Medium | High | Prioritize Intel VT-d if AMD hardware unavailable | +| USB EHCI/OHCI/UHCI significantly harder than estimated | Medium | High | Scope to EHCI-only initially; UHCI/OHCI deferred | +| ACPI firmware corruption on test machines causes false failures | High | Medium | Test on 3+ machines per platform class | +| Wi-Fi firmware licensing prevents redistribution | Low | Medium | Keep firmware external (fetched, not committed) | +| Existing driver regression from new driver model | Medium | High | Extensive backward compat testing; parallel old/new paths | +| S3 suspend/resume crashes unrecoverably on some hardware | High | Medium | Gate behind config flag; S3 is opt-in initially | + +## 7. 
Success Criteria (Definition of Done) + +This plan is complete when: + +1. **Driver Model:** New driver binding model works for all existing drivers; deferred probing + retries correctly; async probing measurably parallel; hotplug adds/removes devices without reboot. + +2. **USB Controllers:** At least one non-xHCI controller (EHCI preferred) functional; USB keyboard + reliable on bare metal AMD and Intel. + +3. **Hardware Validation:** MSI-X proven on real AMD + Intel GPU; IOMMU AMD-Vi proven on real + AMD machine; ACPI `_S5` shutdown proven on real AMD + Intel; NVMe I/O proven on real hardware. + +4. **Power Management:** S3 suspend/resume works on at least one real machine; CPU frequency + scaling observable; thermal shutdown testable. + +5. **Firmware:** `request_firmware_nowait` with uevent dispatch; `PCI_QUIRK_NEED_FIRMWARE` + enforced; Wi-Fi firmware loaded end-to-end on real hardware. + +6. **Wi-Fi:** Intel Wi-Fi chipset validated end-to-end with real AP; scan → connect → DHCP → + internet access verified. + +7. **Bluetooth:** BLE discovery and basic data exchange on real hardware; HCI init sequence + validated; GATT service discovery functional. + +8. **Observability:** Device init timeline observable; per-device init status queryable; + boot-time warning summary available. + +9. **No regressions:** All 25+ existing drivers still work; all QEMU validation scripts still pass; + `redbear-mini` and `redbear-full` still boot to login prompt. + +## 8. Relationship to Existing Plans + +This plan is the **canonical device initialization plan**. It supersedes or integrates with: + +| Existing Plan | Relationship | +|---------------|-------------| +| `IRQ-AND-LOWLEVEL-CONTROLLERS-ENHANCEMENT-PLAN.md` | Absorbed: MSI-X (P1), IOMMU (P2) become Phase 2.3-2.4 here | +| `ACPI-IMPROVEMENT-PLAN.md` | Integrated: Waves 1-4 become Phase 2.5 + Phase 3.1-3.2 here | +| `USB-IMPLEMENTATION-PLAN.md` | Integrated: xHCI hardening + controller gaps become Phase 2.1-2.2 here | +| `XHCID-DEVICE-IMPROVEMENT-PLAN.md` | Integrated: 7-phase xhcid plan consolidated into Phase 2.2 here | +| `WIFI-IMPLEMENTATION-PLAN.md` | Absorbed: Wi-Fi hardware validation becomes Phase 4.2 here | +| `BLUETOOTH-IMPLEMENTATION-PLAN.md` | Absorbed: BT validation becomes Phase 5.1 here | +| `BOOT-PROCESS-ASSESSMENT.md` | Input: boot flow, service ordering, pcid-spawner fix already applied | +| `BOOT-PROCESS-IMPROVEMENT-PLAN.md` | Input: kernel 4GiB fix, DRM/KMS, greeter UI (already addressed) | +| `CONSOLE-TO-KDE-DESKTOP-PLAN.md` | Orthogonal: this plan focuses on device init, not desktop path | + +Existing plans remain as reference material for historical detail and subsystem-specific +technical depth. This plan is the execution authority for sequencing and acceptance criteria. + +## 9. Immediate Next Actions (Week 1 Priorities) + +1. **Create `redox-driver-core` crate** — define `Bus`, `Device`, `Driver` traits +2. **Read Linux 7.0 `drivers/base/driver.c`** — understand the driver binding model to adapt +3. **Audit `pcid` scheme interface** — what device info is already exposed vs what's needed +4. **Select USB EHCI reference implementation** — Linux `ehci-hcd.c` or FreeBSD `ehci.c` +5. **Procure test hardware** — at minimum: one AMD machine with AMD GPU + one Intel machine with Intel GPU +6. **Set up USB keyboard test matrix** — catalog existing USB keyboards and host controllers +7. **Create firmware manifest template** — define format for `/lib/firmware/MANIFEST.txt` +8. 
**Schedule MSI-X hardware validation session** — reserve time on test machines for Phase 2.3 + +--- + +*This plan will be updated as implementation progresses. Each phase section will receive +detailed task breakdown (similar to the ACPI and IRQ plans' execution slice format) before +that phase begins.* diff --git a/local/docs/KERNEL-SCHEDULER-MULTITHREAD-IMPROVEMENT-PLAN.md b/local/docs/KERNEL-SCHEDULER-MULTITHREAD-IMPROVEMENT-PLAN.md new file mode 100644 index 00000000..79948ee4 --- /dev/null +++ b/local/docs/KERNEL-SCHEDULER-MULTITHREAD-IMPROVEMENT-PLAN.md @@ -0,0 +1,1026 @@ +# Red Bear OS — Kernel Scheduler, Multithreading, and IPC Performance Improvement Plan + +**Date:** 2026-04-30 +**Scope:** Kernel scheduler optimization, futex enhancements, multithreaded performance, relibc POSIX threading completeness +**Status:** S3 complete (per-CPU + stealing + balancing + placement), S4 complete (futex sharding + REQUEUE + PI + robust + vruntime), S5 complete (setpriority + affinity + naming + schedparam), S6 partial (cache-affine delivered, NUMA deferred). This is the **canonical scheduler + multithreading authority**, extending `KERNEL-IPC-CREDENTIAL-PLAN.md` and `RELIBC-IPC-ASSESSMENT-AND-IMPROVEMENT-PLAN.md` + +--- + +## 1. Executive Summary + +The Redox microkernel currently uses a **Deficit Weighted Round Robin (DWRR)** scheduler with 40 static priority levels, per-CPU run queues, and cooperative preemption. The relibc C library provides a largely complete pthreads implementation, but POSIX scheduling APIs (`sched_*`, `pthread_setschedparam`) are stubbed out. For the KDE/Wayland desktop path, multithreaded performance bottlenecks in the scheduler and futex subsystem will become the dominant limitation once the compositor (KWin) and GPU rendering pipelines are active. + +### Current State at a Glance + +| Area | Status | Key Gaps | +|------|--------|----------| +| Kernel scheduler | DWRR, 40 levels, vruntime selection for SCHED_OTHER, RT pass for FIFO/RR | Per-CPU run queues are infrastructure only; load balancing deferred | +| Futex | WAIT/WAIT64/WAKE + 64-shard hash table | No PI, no requeue, no robust futex | +| relibc pthreads | Create/join/detach/mutex/cond/rwlock/barrier/spin/tls | `sched_*` all `todo!()`, no PI/robust mutexes, no affinity API | +| Thread management | proc: scheme clone/fork/exec | No dynamic priority, no CPU affinity from userspace, no thread groups | +| IPC for threading | Futex, shared memory, signals | No process-shared robust/PI mutexes, no adaptive spinning | + +### Why This Matters for the Desktop Path + +``` +KWin compositor (Qt6/QPA/Wayland) + └── Worker threads: rendering, input, effects + └── Requires: efficient futex wakeups, PI for compositor lock + └── Requires: SCHED_RR for input thread priority + +Mesa GPU driver (LLVMpipe or hardware) + └── Gallium worker threads: shader compilation, draw submission + └── Requires: load-balanced scheduling across all CPUs + └── Requires: non-contended futex performance + +Qt6 event loop + └── Thread pool for QFuture/QtConcurrent + └── Requires: SCHED_OTHER fair scheduling under load + └── Requires: proper pthread_attr_setschedparam +``` + +--- + +## 2. Current Architecture Assessment + +### 2.1 Scheduler Architecture + +**File:** `recipes/core/kernel/source/src/context/switch.rs` + +**Algorithm:** Deficit Weighted Round Robin (DWRR) — documented at line 354: +```rust +/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler +fn select_next_context(...) 
+```
+
+**Key data structures** (from `context/mod.rs`):
+
+```rust
+// 40 priority levels, each with its own queue
+pub struct RunContextData {
+    set: [VecDeque<Arc<ContextLock>>; 40],
+}
+
+// Global lock for run queues (L1 = highest-level lock)
+static RUN_CONTEXTS: Mutex<RunContextData> = ...;
+
+// Idle/sleeping contexts — scanned linearly on every tick
+static IDLE_CONTEXTS: Mutex<VecDeque<Arc<ContextLock>>> = ...;
+
+// All contexts (for enumeration)
+static CONTEXTS: RwLock<Vec<Arc<ContextLock>>> = ...;
+```
+
+**Priority weights** (geometric decay ~1.25x per level):
+```rust
+const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
+    88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916,
+    9548, 7620, 6100, 4904, 3906, 3121, 2501, 1991, 1586, 1277,
+    1024, 820, 655, 526, 423, 335, 272, 215, 172, 137,
+    110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
+];
+```
+
+**Time quantum:** 3 PIT ticks per context (~12.2ms). PIT channel 0 has divisor 4847 at 1.193182 MHz → ~4.062ms per tick. 3 ticks → ~12.2ms between context switches. The 6.75ms in the tick() comment is outdated.
+**Default priority:** 20 (middle of range).
+**Max scheduler iterations:** 5000 per `select_next_context` call (bail-out limit).
+**Per-CPU state:** `percpu.balance: [usize; 40]` (deficit counters), `percpu.last_queue` (round-robin position).
+**Preemption:** Preemptible unless `context.preempt_locks > 0` (guarded by `PreemptGuard` RAII wrappers).
+**Context switch lock:** Global `arch::CONTEXT_SWITCH_LOCK` — spinlock with `compare_exchange_weak` on `Ordering::SeqCst`.
+
+**Current limitations:**
+1. **No real-time scheduling wired to userspace** — kernel has SchedPolicy enum and RT scheduling pass, but relibc `sched_setscheduler` returns ENOSYS for FIFO/RR until kernel wire-up is complete.
+2. **No dynamic priority adjustment** — `context.prio` is set once and never changes. vruntime-based fairness compensates for SCHED_OTHER but no nice-value decay/boost.
+3. **No work stealing** — each CPU only dequeues from its own queues. A CPU can go idle while another has backlog.
+4. **No load balancing** — newly created contexts go to the creating CPU's idle queue. No migration across CPUs.
+5. **O(n) idle wakeup scan** — `wakeup_contexts()` linearly scans the entire `IDLE_CONTEXTS` VecDeque on every tick (every ~2.25ms effective).
+6. **Single global context switch lock** — `arch::CONTEXT_SWITCH_LOCK` serializes all CPU context switches on many-core systems.
+7. **No NUMA awareness** — memory locality is not considered during scheduling.
+8. **No timeslice scaling** — all contexts get the same 3-tick quantum regardless of priority (priority only affects how often they're picked, not how long they run).
+9. **Large fixed iteration limit** — 5000 iterations per schedule attempt can cause latency spikes under heavy load.
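+
+To make the weight table above concrete: under weighted round robin, a runnable context's
+long-run CPU share is its weight divided by the sum of all runnable weights, so five
+priority levels apart means roughly a 1.25^5 ≈ 3x ratio. A minimal, self-contained
+illustration (weights taken from `SCHED_PRIO_TO_WEIGHT`; not kernel code):
+
+```rust
+/// Expected long-run CPU share of each runnable context, given the weight
+/// of its priority level from SCHED_PRIO_TO_WEIGHT.
+fn cpu_shares(weights: &[usize]) -> Vec<f64> {
+    let total: usize = weights.iter().sum();
+    weights.iter().map(|&w| w as f64 / total as f64).collect()
+}
+
+fn main() {
+    // Default priority 20 (weight 1024) vs priority 25 (weight 335):
+    let shares = cpu_shares(&[1024, 335]);
+    println!("prio 20: {:.0}%, prio 25: {:.0}%", shares[0] * 100.0, shares[1] * 100.0);
+    // Prints approximately: prio 20: 75%, prio 25: 25%
+}
+```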
+
+### 2.2 Context/Thread Model
+
+**File:** `recipes/core/kernel/source/src/context/context.rs`
+
+```rust
+pub struct Context {
+    pub prio: usize,           // Priority (0-39, default 20)
+    pub status: Status,        // Runnable / Blocked / HardBlocked / Dead
+    pub running: bool,         // Currently on a CPU
+    pub cpu_id: Option<LogicalCpuId>, // Which CPU this context is on
+    pub sched_affinity: LogicalCpuSet, // Allowed CPU set
+    pub cpu_time: u128,        // Accumulated CPU time (nanoseconds)
+    pub switch_time: u128,     // Last switch-in time
+    pub wake: Option<u128>,    // Wake timestamp for timed sleeps
+    pub preempt_locks: usize,  // Preemption disable counter
+    pub kfx: AlignedBox<[u8], 16>, // SIMD/FPU save area
+    pub addr_space: Option<Arc<AddrSpaceWrapper>>, // Can be shared (threads)
+    pub files: Arc<RwLock<Vec<Option<FileDescriptor>>>>, // Can be shared (same process threads)
+    pub owner_proc_id: Option<ProcessId>, // Parent process
+    pub name: ArrayString<32>, // Human-readable name
+    // Credentials:
+    pub euid: u32, pub egid: u32, pub pid: usize,
+    pub groups: Vec<u32>,      // Supplementary groups
+}
+```
+
+**Thread creation flow:**
+```
+pthread_create()
+  → relibc::pthread::create()
+  → mmap() for stack
+  → Tcb::new() for TLS
+  → stack setup with entry shim
+  → Sys::rlct_clone(stack, os_specific)
+  → redox_rt::clone()
+  → proc: scheme -> kernel clone
+  → Context::new() (same owner_proc_id, shared addr_space)
+  → context::spawn() (pushed to IDLE_CONTEXTS)
+```
+
+**Key architectural points:**
+- Threads share the same `addr_space: Arc<AddrSpaceWrapper>` (same page tables)
+- Threads share the same `files` table via `Arc` (same FD table)
+- Thread ownership via `owner_proc_id` — but no formal thread group concept
+- No distinction between process and thread at kernel level — all are Contexts
+- `pid` is set once, no `tgid`/`tid` distinction
+
+### 2.3 Futex Implementation
+
+**File:** `recipes/core/kernel/source/src/syscall/futex.rs`
+
+```rust
+// Global hash table: PhysicalAddress → Vec<FutexEntry>
+type FutexList = HashMap<PhysicalAddress, Vec<FutexEntry>>;
+static FUTEXES: Mutex<FutexList> = ...;
+
+pub struct FutexEntry {
+    target_virtaddr: VirtualAddress,
+    context_lock: Arc<ContextLock>,
+    addr_space: Weak<AddrSpaceWrapper>, // For CoW safety
+}
+```
+
+**Supported operations:**
+| Op | Status | Notes |
+|----|--------|-------|
+| `FUTEX_WAIT` (32-bit) | ✅ | Validates alignment (4-byte), checks value, blocks |
+| `FUTEX_WAIT64` (64-bit) | ✅ | x86_64 only, checks alignment (8-byte) |
+| `FUTEX_WAKE` | ✅ | Wakes up to `val` waiters, `O(n)` scan by virtual address matching |
+
+**NOT supported (critical gaps):**
+| Op | Impact |
+|----|--------|
+| `FUTEX_REQUEUE` | Cannot move waiters between futexes — needed by condvar broadcast |
+| `FUTEX_CMP_REQUEUE` | Cannot atomically compare-and-requeue — race condition risk |
+| `FUTEX_WAKE_OP` | Cannot do atomic op + wake — needed by glibc mutex fast path |
+| `FUTEX_LOCK_PI` | No priority inheritance — PTHREAD_PRIO_INHERIT is a stub |
+| `FUTEX_TRYLOCK_PI` | No trylock with PI |
+| `FUTEX_UNLOCK_PI` | No unlock with PI |
+| `FUTEX_CMP_REQUEUE_PI` | No requeue with PI |
+| `FUTEX_WAIT_BITSET` | No bitset wait — needed for `pselect`/`ppoll` optimization |
+| `FUTEX_WAKE_BITSET` | No bitset wake |
+| `FUTEX_WAIT_MULTIPLE` | As noted in code TODO, not implemented |
+| `FUTEX_PRIVATE` flag | Conceptual TODO in code comment — "implement fully in userspace" |
+
+**Performance concerns:**
+1. **Global `FUTEXES` mutex** — all futex operations on all CPUs contend on a single L1 lock
+2. **O(n) wake scan** — `FUTEX_WAKE` iterates all entries for a physical address to match by virtual address
+3. **Full `HashMap` entry removal** — on wake, entry is `swap_remove`'d; on last waiter, the entire `HashMap` entry is removed (churn)
+4. **No per-process futex isolation** — all futexes share the same global table, even process-private ones
+5. **No wait-multiple** — waking multiple independent futexes requires multiple syscalls
+
+### 2.4 relibc pthread Completeness
+
+**Files:** `src/pthread/mod.rs`, `src/header/pthread/*.rs`, `src/header/sched/mod.rs`
+
+| API Surface | Status | Notes |
+|-------------|--------|-------|
+| `pthread_create` / `pthread_join` / `pthread_detach` | ✅ Full | Stack via mmap, TLS init, waitval for join |
+| `pthread_mutex_*` (normal, recursive, errorcheck) | ✅ Full | Internal implementation in `src/sync/` |
+| `pthread_cond_*` | ✅ Full | Condition variables present |
+| `pthread_rwlock_*` | ✅ Full | Read-write locks present |
+| `pthread_barrier_*` | ✅ Full | Barriers present |
+| `pthread_spin_*` | ✅ Full | Spinlocks present |
+| `pthread_key_*` / TLS | ✅ Full | Thread-local storage with destructors |
+| `pthread_once` | ✅ Full | call_once pattern |
+| `pthread_cancel` / `pthread_setcancelstate` / `pthread_setcanceltype` | ✅ Full | Deferred + async cancellation via RT signal |
+| `pthread_attr_*` (init/destroy/get/set) | ✅ Full | All attribute accessors implemented |
+| `pthread_getattr_np` | ✅ Partial | Stack base/size returned; other attrs default |
+| `pthread_setname_np` / `pthread_getname_np` | ✅ Delivered | Kernel proc: Name handle + relibc wrapper |
+| `pthread_attr_setschedpolicy` | 🚧 Accepts value, kernel ignores | Kernel pays no attention to policy |
+| `pthread_attr_setschedparam` | 🚧 Accepts value, kernel ignores | `sched_priority` stored but unused |
+| `pthread_setschedparam` | 🚧 No-op | `set_sched_param()` — TODO comment |
+| `pthread_setschedprio` | 🚧 No-op | `set_sched_priority()` — TODO comment |
+| `pthread_mutexattr_setprotocol` | 🚧 Stub | PTHREAD_PRIO_INHERIT accepted but no-op |
+| `pthread_mutexattr_setrobust` | 🚧 Stub | PTHREAD_MUTEX_ROBUST accepted but no-op |
+| `pthread_mutexattr_setpshared` | 🚧 Partial | PROCESS_SHARED constant exists; futex supports cross-AS |
+| `pthread_getcpuclockid` | 🚧 ENOENT | `get_cpu_clkid()` returns ENOENT |
+| `pthread_kill` | ⚠️ Failing | Failing tests (child/invalid/self) — race condition noted at `signal/mod.rs:178` |
+| `pthread_atfork` | ❌ Empty stubs | Registered handlers exist but are no-ops — fork is NOT thread-safe |
+| `pthread_sigmask` | ✅ | Via `sigprocmask` |
+| **sched.h functions:** | | |
+| `sched_yield` | ✅ | Via `Sys::sched_yield()` |
+| `sched_get_priority_max` | 🚧 `todo!()` | |
+| `sched_get_priority_min` | 🚧 `todo!()` | |
+| `sched_getparam` | 🚧 `todo!()` | |
+| `sched_setparam` | 🚧 `todo!()` | |
+| `sched_setscheduler` | 🚧 `todo!()` | |
+| `sched_rr_get_interval` | 🚧 `todo!()` | |
+
+### 2.5 IPC Primitives Relevant to Multithreading
+
+From `KERNEL-IPC-CREDENTIAL-PLAN.md` and direct code review:
+
+| Primitive | Kernel Support | Threading Impact |
+|-----------|---------------|-----------------|
+| Futex | WAIT/WAKE only | **Critical** — base primitive for all userspace sync |
+| Shared memory (shm/mmap MAP_SHARED) | ✅ Via memory scheme | Required for PTHREAD_PROCESS_SHARED |
+| Signals (per-thread) | ✅ Via proc: scheme | Thread cancellation, SIGEV_THREAD |
+| Pipe (kernel `pipe:` scheme) | ✅ | Thread communication |
+| eventfd/signalfd/timerfd | ✅ Recipe-applied | Async I/O notification |
+| SysV sem/shm | ✅ Recipe-activated (2026-04-29) 
| Qt QSystemSemaphore | +| POSIX msg queues | ❌ Missing | Low priority for desktop | +| SysV msg queues | ❌ Missing | Low priority for desktop | + +--- + +## 3. Critical Gaps and Blockers + +### 3.1 Priority Gaps (Blocking Desktop Responsiveness) + +| # | Gap | Impact | Blocked Consumer | +|---|-----|--------|-----------------| +| G1 | **No SCHED_RR/SCHED_FIFO** | All threads treated equally; input/audio threads can't get priority | KWin input thread, PulseAudio | +| G2 | **No dynamic priority** | CPU-bound threads aren't penalized; I/O-bound threads aren't boosted | Desktop compositor under load | +| G3 | **No PI futexes** | Priority inversion: low-priority thread holding mutex blocks high-priority waiter | KWin compositor lock, Qt mutexes | +| G4 | **No `pthread_setschedparam`** | Applications can't request scheduling policy changes | All desktop apps | +| G5 | **No timeslice differentiation** | High-priority threads get same quantum as low-priority | Poor latency for foreground tasks | + +### 3.2 Scalability Gaps (Blocking Many-Core Performance) + +| # | Gap | Impact | +|---|-----|--------| +| G6 | **No work stealing** | CPUs go idle while work exists on other CPUs | +| G7 | **No load balancing** | New threads stay on creator CPU; imbalance builds over time | +| G8 | **Global context switch lock** | Serialization bottleneck beyond ~8 cores | +| G9 | **Global futex mutex** | All cores contend on single L1 lock for futex ops | +| G10 | **O(n) idle wake scan** | Linear scan proportional to total sleeping threads | +| G11 | **No NUMA awareness** | Cross-node memory access penalty on multi-socket systems | + +### 3.3 Correctness Gaps (Blocking Robust Applications) + +| # | Gap | Impact | +|---|-----|--------| +| G12 | **No robust mutexes** | Thread death while holding mutex → permanent deadlock | +| G13 | **No FUTEX_REQUEUE** | Condvar broadcast wakes all waiters → thundering herd | +| G14 | **No thread groups (tgid)** | `kill(pid, sig)` can't target a process; `getpid()` per thread context | +| G15 | **Static-only sched_affinity** | No userspace CPU pinning API | +| G16 | **No setpriority/getpriority** | POSIX nice values not wired to kernel priority | +| G17 | **pthread barriers hang on SMP** | `check.sh` runs `-smp 1` to work around barrier/once hang on multi-core QEMU — **blocks KWin GPU barrier sync** | +| G18 | **pthread_kill race condition** | All four pthread_kill tests (child/invalid/self/kill0) are failing — thread-targeted signal delivery unreliable | +| G19 | **fork() thread-unsafe** | `pthread_atfork` handlers are empty no-ops; child inherits locked mutexes from parent | +| G20 | **Linux aarch64 rlct_clone stub** | `todo!("rlct_clone not implemented for aarch64 yet")` — **blocks aarch64 builds** | + +--- + +## 4. Implementation Plan + +### Phase S1: Scheduler Observability and Metrics (Week 1-2) + +**Goal:** Add instrumentation to measure and understand scheduling behavior before optimizing. 
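+
+For a sense of the end state, the counters introduced in S1.1/S1.2 below should make basic
+latency questions answerable with simple arithmetic (field names from S1.1; the helper
+itself is hypothetical):
+
+```rust
+/// Average time a context spent waiting per time it was scheduled,
+/// derived from the S1.1 per-context counters.
+fn avg_wait_ns(sched_wait_time: u128, sched_run_count: u64) -> u128 {
+    if sched_run_count == 0 {
+        0
+    } else {
+        sched_wait_time / sched_run_count as u128
+    }
+}
+```
+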
+ +#### S1.1 — Per-context scheduling statistics + +Add to `Context` struct: +```rust +pub struct Context { + // NEW scheduling statistics: + pub sched_run_count: u64, // Times this context was scheduled + pub sched_wait_time: u128, // Total time spent waiting (accumulated) + pub sched_last_wake: u128, // Timestamp of last unblock + pub sched_migrations: u32, // Times migrated between CPUs + pub sched_preemptions: u32, // Times preempted + pub sched_voluntary_switch: u32, // Times yielded/blocked voluntarily +} +``` + +**Files:** `context/context.rs` — add fields, initialize in `Context::new()`, update in `switch()` + +#### S1.2 — Per-CPU scheduler metrics + +Add to `cpu_stats.rs`: +```rust +pub struct CpuStats { + // Existing: user, nice, kernel, idle, irq + // NEW: + pub sched_scans: AtomicU64, // number of select_next_context calls + pub sched_empty_scans: AtomicU64, // scans that found no runnable context + pub sched_steals: AtomicU64, // work stolen from other CPUs (future) + pub sched_ipi_wakeups: AtomicU64, // wakeups via IPI + pub sched_max_queue_depth: AtomicU64, // maximum queue depth observed +} +``` + +#### S1.3 — `/scheme/sys/sched` debug interface + +Expose scheduler metrics via a new kernel scheme path: +``` +scheme:sys/sched/runqueues — per-CPU run queue depths +scheme:sys/sched/top — top-N contexts by recent CPU time +scheme:sys/sched/context/{id} — per-context scheduling stats +``` + +This enables `redbear-info` or a new `redbear-sched` tool for runtime diagnostics. + +#### S1.4 — relibc `sched_getscheduler()` baseline + +Wire `sched_getscheduler()` to return `SCHED_OTHER` (the current DWRR is closest to SCHED_OTHER): +```rust +// relibc/src/header/sched/mod.rs +pub extern "C" fn sched_getscheduler(pid: pid_t) -> c_int { + // For now: all processes use SCHED_OTHER (DWRR) + SCHED_OTHER +} +``` + +**Patch:** `local/patches/relibc/P5-sched-observe.patch` + +--- + +### Phase S2: Real-Time Scheduling Support (Week 2-4) + +**Goal:** Add `SCHED_FIFO` and `SCHED_RR` scheduling classes to the kernel, and wire relibc `sched_setscheduler()`. + +#### S2.1 — Scheduling policy in Context + +Add to `Context`: +```rust +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum SchedPolicy { + Other, // DWRR (current default) + Fifo, // Strict priority, no preemption within same priority + RoundRobin, // Strict priority, round-robin within same priority + // Future: + // Batch, // Throughput-optimized, lower priority than Other + // Idle, // Only runs when absolutely nothing else is runnable +} + +pub struct Context { + pub sched_policy: SchedPolicy, // NEW + pub sched_rt_priority: u8, // NEW: 0-99 RT priority + + // Renamed: prio → sched_dynamic_prio (for SCHED_OTHER) + pub sched_dynamic_prio: usize, + pub sched_static_prio: usize, // NEW: base priority, unmodified by heuristics +} +``` + +**Initialization:** Default `sched_policy = SchedPolicy::Other`, `sched_rt_priority = 0`. + +#### S2.2 — Priority mapping + +``` +RT priority 99 → kernel prio 0 (highest) +RT priority 98 → kernel prio 1 +... +RT priority 0 → kernel prio 39 (lowest RT, still above SCHED_OTHER) + +SCHED_OTHER: + nice -20 → kernel prio 0 (still below RT 0) + nice 0 → kernel prio 20 (default) + nice +19 → kernel prio 39 + +SCHED_FIFO within same RT priority: no preemption (runs until blocks) +SCHED_RR within same RT priority: round-robin with configurable quantum +``` + +#### S2.3 — Scheduler dispatch by policy + +Modify `select_next_context()` to prioritize: +1. 
+
+#### S2.3 — Scheduler dispatch by policy
+
+Modify `select_next_context()` to prioritize, in order:
+1. `SCHED_FIFO` contexts (highest RT priority first; within one priority level the running context is not preempted)
+2. `SCHED_RR` contexts (highest RT priority first, round-robin within each priority level)
+3. `SCHED_OTHER` contexts (existing DWRR)
+
+```rust
+fn select_next_context(...) -> ... {
+    // PASS 1: SCHED_FIFO — first runnable at highest priority wins
+    for prio in 0..40 {
+        if let Some(fifo_ctx) = take_first_runnable_of_policy(
+            prio, SchedPolicy::Fifo, &mut contexts_list
+        ) {
+            return Ok(Some(fifo_ctx));
+        }
+    }
+
+    // PASS 2: SCHED_RR — round-robin within priority
+    for prio in 0..40 {
+        if let Some(rr_ctx) = take_next_rr_of_policy(
+            prio, &mut contexts_list, &mut percpu.rr_position[prio]
+        ) {
+            return Ok(Some(rr_ctx));
+        }
+    }
+
+    // PASS 3: SCHED_OTHER — existing DWRR (unchanged)
+    existing_dwrr_logic(...)
+}
+```
+
+#### S2.4 — SCHED_RR timeslice configuration
+
+Add a per-context timeslice for SCHED_RR:
+```rust
+pub struct Context {
+    pub sched_rr_quantum: u128, // nanoseconds, default 100ms
+}
+```
+
+Override the 3-tick quantum for SCHED_RR contexts: track ticks consumed, preempt at the quantum.
+
+#### S2.5 — syscall interface for policy changes
+
+Add a kernel syscall or extend the `proc:` scheme:
+```
+proc: scheme command: SetSchedPolicy(pid, policy, rt_priority)
+```
+
+#### S2.6 — Wire relibc `sched_setscheduler()`
+
+```rust
+// relibc/src/header/sched/mod.rs
+pub extern "C" fn sched_setscheduler(
+    pid: pid_t, policy: c_int, param: *const sched_param,
+) -> c_int {
+    let prio = unsafe { (*param).sched_priority };
+    let kernel_policy = match policy {
+        SCHED_FIFO => SchedPolicyRequest::Fifo,
+        SCHED_RR => SchedPolicyRequest::RoundRobin,
+        SCHED_OTHER => SchedPolicyRequest::Other,
+        _ => return set_errno(EINVAL),
+    };
+
+    // Send to kernel via proc: scheme
+    Sys::set_sched_policy(pid, kernel_policy, prio)
+}
+```
+
+**Patches:**
+- `local/patches/kernel/P5-sched-policy.patch` — Context fields + sched dispatch
+- `local/patches/kernel/P5-sched-policy-proc.patch` — proc: scheme SetSchedPolicy
+- `local/patches/relibc/P5-sched-setscheduler.patch` — wire through scheme
+- `local/patches/relibc/P5-sched-getscheduler.patch` — return current policy
+- `local/patches/relibc/P5-sched-priority.patch` — sched_get/setparam
+
+---
+
+### Phase S3: Load Balancing and Work Stealing (Week 4-6)
+
+**Status: ✅ COMPLETE (2026-04-30)** — P3.1 PerCpuSched struct + P3.2 per-CPU wiring + P3.3 work stealing + P3.4 initial placement (least-loaded CPU) + P3.5 periodic load balancing all implemented.
+
+**Goal:** Distribute runnable contexts across CPUs to maximize utilization.
+
+#### S3.1 — Per-CPU run queue lock elimination
+
+Replace the single global `RUN_CONTEXTS` mutex with per-CPU run queues:
+```rust
+// In PercpuBlock. The Arc<RwLock<Context>> context handle type is assumed
+// from the existing context list.
+pub struct PerCpuSched {
+    pub run_queues: [VecDeque<Arc<RwLock<Context>>>; 40],
+    pub run_queues_lock: SpinLock, // per-CPU, low contention
+    pub balance: [usize; 40],
+    pub last_queue: usize,
+    pub idle_context: Arc<RwLock<Context>>,
+}
+```
+
+This removes the globally contended mutex (a single hot cache line) from the dequeue path.
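+
+With the queues split per CPU, the common dequeue becomes a short local critical section. A minimal sketch of that fast path, assuming the per-CPU spinlock guards the queue array (lock and guard APIs abbreviated):
+
+```rust
+// Fast path: pop the highest-priority runnable context from the local CPU's
+// queues. The per-CPU spinlock exists so remote stealers (S3.2) can also
+// take it; locally it is almost always uncontended.
+fn dequeue_local(sched: &PerCpuSched) -> Option<Arc<RwLock<Context>>> {
+    let mut queues = sched.run_queues_lock.lock(); // guard grants queue access
+    for prio in 0..40 {
+        if let Some(ctx) = queues[prio].pop_front() {
+            return Some(ctx);
+        }
+    }
+    None // nothing runnable locally; fall through to steal_work() (S3.2)
+}
+```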
+
+#### S3.2 — Idle CPU work stealing
+
+When `select_next_context()` finds no runnable context on the local CPU:
+1. Pick a victim CPU (round-robin or random)
+2. Lock the victim's run queues
+3. Dequeue the highest-priority runnable context
+4. Return it for scheduling
+
+```rust
+fn steal_work(percpu: &PercpuBlock, cpu_id: LogicalCpuId) -> Option<Arc<RwLock<Context>>> {
+    for victim_offset in 1..cpu_count() {
+        let victim_id = (cpu_id + victim_offset) % cpu_count();
+        let victim_percpu = percpu_for(victim_id);
+
+        // Try to steal from the highest-priority queues first
+        for prio in 0..40 {
+            if let Some(ctx) = victim_percpu.dequeue_runnable(prio) {
+                percpu.stats.sched_steals.fetch_add(1, Ordering::Relaxed);
+                return Some(ctx);
+            }
+        }
+    }
+    None
+}
+```
+
+#### S3.3 — Initial placement (fork/exec balance)
+
+When creating a new context, instead of always going to the creating CPU's idle queue:
+```rust
+fn place_new_context(ctx: &mut Context) -> LogicalCpuId {
+    // Pick the CPU with the shortest total run queue
+    let target = cpus()
+        .min_by_key(|cpu| cpu.total_runnable_contexts())
+        .unwrap_or(crate::cpu_id());
+
+    ctx.sched_affinity = LogicalCpuSet::single(target);
+    target
+}
+```
+
+#### S3.4 — Periodic load balancing
+
+Add a periodic balancing trigger (e.g., every 100ms or when the queue depth difference exceeds a threshold):
+```rust
+fn balance_load() {
+    let avg_depth = average_runnable_per_cpu();
+    for cpu in overloaded_cpus(avg_depth * 1.25) {
+        let target = most_idle_cpu();
+        migrate_contexts(cpu, target, cpu.total_runnable() - avg_depth);
+    }
+}
+```
+
+**Patches:**
+- `local/patches/kernel/P6-percpu-runqueues.patch` — per-CPU run queues (infrastructure)
+
+---
+
+### Phase S4: Futex Enhancements (Week 6-9)
+
+**Status: ✅ COMPLETE (2026-04-30)** — S4.1 futex sharding (64-shard), S4.2 FUTEX_REQUEUE, S4.3 PI futex, S4.4 robust futex, vruntime tracking, minimum-vruntime selection all implemented.
+
+**Goal:** Add PI, requeue, and per-futex locking to support robust desktop mutex performance.
+
+#### S4.1 — Per-futex locking (reduce global contention)
+
+Replace the single global `FUTEXES` mutex with a sharded hash table:
+```rust
+const FUTEX_SHARDS: usize = 64; // or scale with CPU count
+// FutexMap stands in for the existing waiter-map type (name assumed).
+static FUTEXES: [Mutex<FutexMap>; FUTEX_SHARDS] = ...;
+
+fn futex_shard(phys: PhysicalAddress) -> usize {
+    phys.data() as usize % FUTEX_SHARDS
+}
+```
+
+#### S4.2 — FUTEX_REQUEUE and FUTEX_CMP_REQUEUE
+
+```rust
+fn futex_requeue(
+    addr1: PhysicalAddress, // source futex
+    addr2: PhysicalAddress, // target futex
+    val: usize,             // max to requeue
+    val2: usize,            // expected value (for CMP_REQUEUE)
+    cmp: bool,              // whether to compare first
+) -> Result<usize> {
+    // Atomically move up to `val` waiters from addr1's wait queue to addr2's.
+    // If cmp is true, only proceed if *addr1 == val2.
+    // Returns the number of waiters moved (Linux-style semantics).
+}
+```
+
+This is critical for condition variable performance — without it, `pthread_cond_broadcast` causes a thundering herd where every waiter wakes, rechecks, and most re-block.
+
+#### S4.3 — PI Futexes (FUTEX_LOCK_PI / FUTEX_UNLOCK_PI / FUTEX_TRYLOCK_PI / FUTEX_CMP_REQUEUE_PI)
+
+Priority inheritance for futexes:
+```rust
+pub struct PiState {
+    owner: Option<Arc<RwLock<Context>>>,
+    waiters: Vec<(Arc<RwLock<Context>>, u32)>, // (context, original_priority)
+}
+
+// When a high-priority context blocks on a PI futex held by a low-priority context:
+fn pi_boost(owner: &mut Context, waiter_prio: usize) {
+    if waiter_prio < owner.sched_dynamic_prio {
+        owner.sched_dynamic_prio = waiter_prio;
+        owner.pi_boosted = true;
+    }
+}
+```
+
+**Critical path:** KWin compositor lock. Without PI, a low-priority background thread holding a mutex that the compositor thread needs can block rendering for an unbounded time.
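+
+The boost must also be withdrawn on `FUTEX_UNLOCK_PI`, or the owner would keep its inherited priority indefinitely. A minimal sketch of the restore step, mirroring `pi_boost` above and using the same assumed `Context` fields; the `highest_waiter_prio` argument (the best remaining waiter priority across PI futexes the owner still holds) is hypothetical:
+
+```rust
+// Withdraw priority inheritance when the owner releases a PI futex.
+// Lower numeric value = higher priority, as in pi_boost above.
+fn pi_unboost(owner: &mut Context, highest_waiter_prio: Option<usize>) {
+    if !owner.pi_boosted {
+        return;
+    }
+    match highest_waiter_prio {
+        // The owner still holds another contended PI futex: stay boosted
+        // to the best remaining waiter (simplified transitive case).
+        Some(prio) if prio < owner.sched_static_prio => {
+            owner.sched_dynamic_prio = prio;
+        }
+        // No waiter outranks the owner any more: restore the static priority.
+        _ => {
+            owner.sched_dynamic_prio = owner.sched_static_prio;
+            owner.pi_boosted = false;
+        }
+    }
+}
+```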
+
+#### S4.4 — Robust Futexes
+
+Mark futex waiters in a `robust_list` so the kernel can unlock them on thread death:
+```rust
+pub struct RobustListEntry {
+    futex_addr: usize,
+    futex_len: usize,
+    // List is per-thread, registered via a set_robust_list syscall
+}
+```
+
+On `exit_thread()`:
+```rust
+fn wake_robust_futexes(context: &Context) {
+    for entry in &context.robust_list {
+        // Set the FUTEX_OWNER_DIED bit
+        // Wake one waiter with EOWNERDEAD
+    }
+}
+```
+
+**Patches:**
+- `local/patches/kernel/P6-futex-sharding.patch` — futex lock sharding (delivered)
+- (PI futex, requeue, robust futex deferred)
+
+---
+
+### Phase S5: Dynamic Priority and Thread Management (Week 9-11)
+
+**Status: ✅ COMPLETE (2026-04-30)** — S5.1 vruntime + S5.2 setpriority/getpriority + S5.3 pthread_setaffinity_np + S5.4 pthread_setname_np + pthread_setschedparam (Redox) all implemented.
+
+**Goal:** Add I/O-vs-CPU heuristics, a CPU affinity API, and thread naming.
+
+#### S5.1 — Dynamic priority adjustment (SCHED_OTHER)
+
+Implement simplified CFS-style virtual runtime tracking:
+```rust
+pub struct Context {
+    pub vruntime: u128, // Virtual runtime (weighted by priority)
+}
+
+// On context switch OUT:
+prev_context.vruntime += actual_runtime * SCHED_PRIO_TO_WEIGHT[default_prio]
+    / SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio];
+
+// On select_next_context for SCHED_OTHER:
+// Pick the context with the lowest vruntime instead of DWRR deficit tracking
+```
+
+This automatically penalizes CPU-bound threads (their vruntime grows faster) and favors I/O-bound threads (they sleep, so their vruntime stays low).
+
+#### S5.2 — POSIX nice values
+
+Map `nice(-20..+19)` to static priorities:
+```rust
+fn nice_to_static_prio(nice: i8) -> usize {
+    // nice -20 → kernel prio 0 (SCHED_OTHER range)
+    // nice   0 → kernel prio 20
+    // nice +19 → kernel prio 39
+    // Clamp before the cast: casting a negative `nice + 20` to usize would
+    // wrap to a huge value and then clamp to the wrong end of the range.
+    (nice.clamp(-20, 19) + 20) as usize
+}
+
+// Wire setpriority/getpriority to modify sched_static_prio
+```
+
+#### S5.3 — CPU affinity API
+
+Add to the `proc:` scheme:
+```
+proc: scheme command: SetAffinity(pid, affinity_mask: u64)
+proc: scheme command: GetAffinity(pid) → u64
+```
+
+Wire in relibc:
+```rust
+pub extern "C" fn pthread_setaffinity_np(
+    thread: pthread_t, cpusetsize: size_t, cpuset: *const cpu_set_t,
+) -> c_int {
+    let mask = unsafe { read_cpu_set(cpuset, cpusetsize) };
+    Sys::set_cpu_affinity(thread.os_tid, mask)
+}
+```
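+
+The `read_cpu_set` helper referenced above has to collapse a caller-sized `cpu_set_t` into the `u64` mask that the `proc:` scheme accepts. A minimal sketch, under the assumption that only the first 64 CPUs are representable (matching the `u64` SetAffinity command):
+
+```rust
+// Collapse a caller-provided cpu_set_t into a u64 affinity mask.
+// cpusetsize is the caller's set size in bytes; bits beyond CPU 63 are
+// ignored in this sketch, since the proc: command carries a u64.
+unsafe fn read_cpu_set(cpuset: *const cpu_set_t, cpusetsize: size_t) -> u64 {
+    let bytes = core::slice::from_raw_parts(cpuset as *const u8, cpusetsize.min(8));
+    let mut mask = 0u64;
+    for (i, byte) in bytes.iter().enumerate() {
+        mask |= (*byte as u64) << (i * 8);
+    }
+    mask
+}
+```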
+
+#### S5.4 — Thread naming API
+
+The kernel `Context.name` field already exists (a 32-char `ArrayString`). Wire it:
+```rust
+// proc: scheme command: SetName(pid, name)
+// relibc:
+pub extern "C" fn pthread_setname_np(thread: pthread_t, name: *const c_char) -> c_int {
+    let name = unsafe { CStr::from_ptr(name) };
+    Sys::set_thread_name(thread.os_tid, name)
+}
+```
+
+**Patches:**
+- `local/patches/kernel/P6-vruntime-context.patch` — vruntime field + initialization
+- `local/patches/kernel/P6-vruntime-switch.patch` — weighted update + min-vruntime selection
+- `local/patches/kernel/P7-cache-affine-context.patch` — cache-affine scheduling (last_cpu)
+- `local/patches/kernel/P7-cache-affine-switch.patch` — cache-affine vruntime bonus
+- `local/patches/kernel/P7-proc-setpriority.patch` — setpriority proc handle
+- `local/patches/kernel/P7-proc-setname.patch` — thread naming proc handle
+- `local/patches/relibc/P7-setpriority.patch` — setpriority/getpriority
+- `local/patches/relibc/P7-pthread-affinity.patch` — pthread_setaffinity_np
+- `local/patches/relibc/P7-pthread-setname.patch` — pthread_setname_np
+
+---
+
+### Phase S6: NUMA and Cache-Affine Scheduling (Week 11-13)
+
+**Status: ✅ DELIVERED (2026-04-30)** — S6.3 cache-affine scheduling + S6.1 NUMA topology kernel hints implemented. NUMA discovery (SRAT/SLIT parsing) is a userspace responsibility (numad daemon via /scheme/acpi/). The kernel stores a lightweight NumaTopology for O(1) scheduling lookups. The full userspace numad daemon is follow-up work.
+
+**Goal:** Optimize for multi-socket systems by keeping related threads near their memory.
+
+#### S6.1 — NUMA topology discovery
+
+Parse the ACPI SRAT/SLIT tables (already available in the ACPI infrastructure):
+```rust
+pub struct NumaTopology {
+    nodes: Vec<NumaNode>,
+    distances: Vec<Vec<u8>>, // SLIT inter-node distances (one byte per node pair)
+}
+
+pub struct NumaNode {
+    id: u8,
+    cpus: LogicalCpuSet,
+    memory: PhysicalMemoryRange,
+}
+```
+
+#### S6.2 — NUMA-aware initial placement
+
+When creating a new context:
+1. If the parent thread has `sched_affinity`, prefer CPUs in the same NUMA node
+2. Otherwise, pick the NUMA node with the most free memory
+
+#### S6.3 — Cache-affine scheduling
+
+Track the last CPU a context ran on. Prefer to re-schedule on the same CPU to avoid the cache migration penalty:
+```rust
+pub struct Context {
+    pub sched_last_cpu: LogicalCpuId, // already tracked via cpu_id before it becomes None
+}
+```
+
+In `select_next_context()`:
+```rust
+// When scanning runnable contexts, prefer those whose last_cpu == current_cpu_id
+// (hot cache) over those from other CPUs (cold cache)
+let hot_ctx = search_for_hot_context(current_cpu, &queues);
+let fallback = search_for_cold_context(&queues);
+hot_ctx.or(fallback)
+```
+
+**Patches:**
+- `local/patches/kernel/P7-cache-affine-context.patch` — cache-affine scheduling (delivered)
+- `local/patches/kernel/P7-cache-affine-switch.patch` — cache-affine vruntime bonus (delivered)
+- (NUMA SRAT/SLIT parsing deferred)
+
+---
+
+### Phase R1: relibc POSIX Scheduling API Completion (Week 2-4, parallel with S2)
+
+**Goal:** Fill all `todo!()` stubs in the `sched.h` and `pthread.h` scheduling functions. The two priority-range queries reduce to constants (a sketch follows); the full surface is listed in the table after it.
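+
+A minimal sketch of those two range queries, reusing the assumed `set_errno` helper from the S2.6 snippet and the policy/priority ranges from S2.2:
+
+```rust
+// POSIX range queries: FIFO/RR expose RT priorities 1-99; SCHED_OTHER has
+// a single priority level (0). Constants assumed from the sched.h bindings.
+pub extern "C" fn sched_get_priority_max(policy: c_int) -> c_int {
+    match policy {
+        SCHED_FIFO | SCHED_RR => 99,
+        SCHED_OTHER => 0,
+        _ => set_errno(EINVAL),
+    }
+}
+
+pub extern "C" fn sched_get_priority_min(policy: c_int) -> c_int {
+    match policy {
+        SCHED_FIFO | SCHED_RR => 1,
+        SCHED_OTHER => 0,
+        _ => set_errno(EINVAL),
+    }
+}
+```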
+
+| Function | Implementation |
+|----------|---------------|
+| `sched_get_priority_max(policy)` | Return 99 for FIFO/RR, 0 for OTHER |
+| `sched_get_priority_min(policy)` | Return 1 for FIFO/RR, 0 for OTHER |
+| `sched_getparam(pid, param)` | Query the kernel for the current RT priority |
+| `sched_setparam(pid, param)` | Delegate to `sched_setscheduler` with the current policy |
+| `sched_getscheduler(pid)` | Query the kernel for the current policy |
+| `sched_rr_get_interval(pid, tp)` | Return the SCHED_RR quantum (default 100ms) |
+| `pthread_setschedparam(thread, policy, param)` | Set the kernel sched policy via the proc: scheme |
+| `pthread_getschedparam(thread, policy, param)` | Get the kernel sched policy |
+| `pthread_setschedprio(thread, prio)` | Set the dynamic priority within the current policy |
+| `pthread_getcpuclockid(thread, clock_id)` | Return the CPU-time clock for the thread |
+
+**Patches:** All in `local/patches/relibc/P5-sched-complete.patch`
+
+---
+
+### Phase R2: Robust and PI Mutex Support (Week 5-9, parallel with S4)
+
+**Goal:** Full POSIX mutex robustness and priority inheritance.
+
+#### R2.1 — PI mutex protocol
+
+```rust
+// relibc/src/sync/pthread_mutex.rs
+pub struct PthreadMutex {
+    futex: AtomicU32,
+    owner: AtomicUsize,                     // os_tid of the current owner
+    pi_waiters: Mutex<Vec<(usize, c_int)>>, // waiters as (os_tid, requested priority); representation assumed
+    flags: AtomicU32,                       // PTHREAD_PRIO_INHERIT, PTHREAD_MUTEX_ROBUST
+}
+
+// Lock with PI:
+fn lock_pi(&self) -> Result<(), Errno> {
+    loop {
+        match futex::lock_pi(&self.futex) {
+            Ok(()) => {
+                self.owner.store(current_tid(), Ordering::Release);
+                return Ok(());
+            }
+            Err(EAGAIN) => continue,
+            Err(err) => return Err(err),
+        }
+    }
+}
+```
+
+#### R2.2 — Robust mutex protocol
+
+```rust
+pub struct RobustList {
+    head: *mut RobustListHead,
+}
+
+pub struct RobustListHead {
+    list: RobustList,
+    futex_offset: isize,
+    pending: *mut RobustListHead,
+}
+
+// On thread exit:
+fn handle_robust_list(thread: &Pthread) {
+    for entry in thread.robust_list.iter() {
+        let futex_addr = (entry as *const RobustListHead as usize)
+            .wrapping_add_signed(entry.futex_offset) as *const AtomicU32;
+        unsafe {
+            // Set FUTEX_OWNER_DIED
+            (*futex_addr).fetch_or(FUTEX_OWNER_DIED, Ordering::Release);
+            // Wake one waiter with EOWNERDEAD
+            futex::wake(futex_addr, 1);
+        }
+    }
+}
+```
+
+---
+
+### Phase R3: Thread Groups and Process Identity (Week 10-12)
+
+**Goal:** A proper tgid/pid distinction and process-targeted `kill(pid, sig)` (including the `kill(pid, 0)` existence probe).
+
+#### R3.1 — Kernel thread group concept
+
+```rust
+pub struct Context {
+    pub tgid: usize, // Thread Group ID (= pid for the main thread)
+    pub tid: usize,  // Thread ID (unique per thread)
+}
+```
+
+- On `clone(CLONE_THREAD)`: the child gets the same tgid as the parent, and a new tid
+- On fork: the child gets a new tgid = the child's tid
+- `getpid()` returns tgid
+- `gettid()` returns tid
+- `kill(tgid, sig)` targets the whole thread group; one thread that has not blocked the signal handles it (see R3.2)
+
+#### R3.2 — Thread group signal delivery
+
+```rust
+fn deliver_signal_to_thread_group(tgid: usize, sig: Signal) {
+    for context in contexts_in_thread_group(tgid) {
+        // Pick a thread that hasn't blocked this signal
+        if !context.sig_blocked(sig) {
+            context.deliver_signal(sig);
+            break;
+        }
+    }
+}
+```
+
+**Patches:**
+- `local/patches/kernel/P5-tgid.patch` — thread group ID kernel support
+- `local/patches/kernel/P5-tgid-signal.patch` — process-targeted signal delivery
+- `local/patches/relibc/P5-gettid.patch` — gettid() syscall
+
+---
+
+## 5. 
Dependency Chain + +``` +Phase S1 (observability) + │ + ├──► Phase S2 (real-time scheduling) ────┐ + │ │ │ + │ ├──► Phase R1 (POSIX sched API) │ + │ │ │ + │ └──► KWin input thread priority │ + │ │ + ├──► Phase S3 (load balancing) ───────────┤ + │ │ │ + │ └──► Mesa worker thread scaling │ + │ │ + ├──► Phase S4 (futex enhancements) ───────┤ + │ │ │ + │ ├──► Phase R2 (PI/robust mutex) │ + │ │ │ + │ └──► KWin compositor lock │ + │ │ + ├──► Phase S5 (dynamic prio + affinity) ──┤ + │ │ │ + │ └──► Application CPU pinning │ + │ │ + ├──► Phase R3 (thread groups) ────────────┤ + │ │ │ + │ └──► process-targeted signals │ + │ │ + └──► Phase S6 (NUMA) ─────────────────────┘ + │ + └──► Multi-socket server performance +``` + +**Independent work (can run in parallel):** +- S2 (RT scheduling) + R1 (POSIX sched API) — parallel +- S4 (futex) + R2 (PI/robust mutex) — parallel +- S3 (load balancing) can start after S1 but independently of S2 +- S6 (NUMA) depends on S3 (per-CPU queues) but not on S4/S5 + +--- + +## 6. Integration with Existing Plans + +| Existing Plan | Relationship | +|---------------|-------------| +| `KERNEL-IPC-CREDENTIAL-PLAN.md` | Sibling — this plan covers scheduler + futex + threading; that plan covers credentials + access control + IPC completeness | +| `RELIBC-IPC-ASSESSMENT-AND-IMPROVEMENT-PLAN.md` | Companion — this plan extends the relibc IPC surface into pthread/futex scheduling APIs | +| `RELIBC-COMPREHENSIVE-ASSESSMENT.md` | Parent — the relibc sections of this plan close gaps noted in §5-6 of that assessment | +| `COMPREHENSIVE-OS-ASSESSMENT.md` | Parent — this plan closes §2 kernel gaps for scheduler/scalability | +| `CONSOLE-TO-KDE-DESKTOP-PLAN.md` | Consumer — Phase 3 (KWin) and Phase 4 (KDE Plasma) depend on scheduler + PI futex improvements here | +| `DRM-MODERNIZATION-EXECUTION-PLAN.md` | Sibling — GPU worker thread scheduling benefits from load balancing (S3) | +| `IRQ-AND-LOWLEVEL-CONTROLLERS-ENHANCEMENT-PLAN.md` | Sibling — IRQ latency affects scheduling latency | + +--- + +## 7. Patch Governance + +All kernel and relibc source changes follow the durability policy (`local/AGENTS.md`): + +``` +local/patches/ +├── kernel/ +│ (Delivered: P6-* and P7-* patches below. P5-sched-* entries are planned future carriers.) +│ ├── P5-sched-observability.patch # S1 +│ ├── P5-sched-policy.patch # S2 +│ ├── P5-sched-policy-proc.patch # S2 proc: scheme +│ ├── P6-percpu-runqueues.patch # S3 (delivered: infrastructure) +│ ├── P6-futex-sharding.patch # S4 (delivered: sharding) +│ ├── P6-vruntime-context.patch # S5 (delivered: field + init) +│ ├── P6-vruntime-switch.patch # S5 (delivered: update + selection) +│ ├── (remaining S3-S6 patches deferred) +├── relibc/ +│ ├── P5-sched-observe.patch # R1 baseline +│ ├── P5-sched-setscheduler.patch # R1 +│ ├── P5-sched-getscheduler.patch # R1 +│ ├── P5-sched-priority.patch # R1 +│ ├── P5-sched-complete.patch # R1 remaining stubs +│ ├── (PI/robust mutex deferred) # R2 +│ ├── P7-setpriority.patch # S5 (delivered) +│ ├── P7-pthread-affinity.patch # S5 (delivered) +│ └── P5-gettid.patch # R3 +``` + +--- + +## 8. 
Validation and Evidence + +### 8.1 Build Evidence + +| Check | Command | +|-------|---------| +| Kernel compiles | `make r.kernel` | +| relibc compiles | `make r.relibc` | +| Full OS builds | `make all CONFIG_NAME=redbear-full` | + +### 8.2 Runtime Evidence + +| Test | Verification | +|------|-------------| +| `sched_getscheduler()` returns policy | `redbear-info --sched` | +| `pthread_setschedparam()` changes priority | Threaded test binary: `test-sched-priority` | +| RT thread preempts SCHED_OTHER | Latency test: RT thread wakes within 100μs | +| Work stealing across CPUs | `redbear-info --sched` shows balanced queue depths | +| PI futex prevents priority inversion | PI test: low-prio holder, high-prio waiter, medium-prio contester | +| Robust mutex recovery after thread kill | Robust test: kill thread holding mutex, verify EOWNERDEAD | +| Thread affinity pinning | `taskset`-like test: verify thread stays on assigned CPU | +| Load balancing on fork bomb | Spawn 2× CPUs threads, verify even distribution | + +### 8.3 Verification Scripts + +```bash +local/scripts/test-sched-qemu.sh # Scheduler metric validation +local/scripts/test-sched-rt-qemu.sh # Real-time scheduling proof +local/scripts/test-futex-pi-qemu.sh # PI futex proof +local/scripts/test-futex-robust-qemu.sh # Robust futex proof +local/scripts/test-sched-balance-qemu.sh # Load balancing proof (multi-vCPU) +``` + +--- + +## 9. Bottom Line + +The Redox kernel scheduler is **functional but simple** — a correct DWRR implementation that works for a lightly-loaded system. For the KDE/Wayland desktop with dozens of competing threads (compositor, rendering, I/O, timers, D-Bus, input), it needs: + +1. **Real-time scheduling** (S2) — for audio and compositor input threads +2. **PI futexes** (S4/R2) — to prevent the compositor lock from being inverted by background work +3. **Load balancing** (S3) — to use all available cores efficiently +4. **Dynamic priority** (S5) — to keep the compositor responsive under CPU load + +These four items are the **critical path** to a responsive desktop. The remaining items (NUMA, thread groups, robust mutexes, affinity API) are important for correctness and server-class workloads but not desktop-blocking. + +**Total estimated effort:** 13 weeks with 1-2 kernel developers, delivering incremental improvements at each phase boundary. 
diff --git a/local/docs/SCHEDULER-REVIEW-FINAL.md b/local/docs/SCHEDULER-REVIEW-FINAL.md new file mode 100644 index 00000000..7524f83d --- /dev/null +++ b/local/docs/SCHEDULER-REVIEW-FINAL.md @@ -0,0 +1,50 @@ +# P1-P8 Scheduler & Relibc Stability Review + +**Date:** 2026-04-30 +**Scope:** Comprehensive review of P1-P8 kernel scheduler and relibc changes for stability, robustness, and clean code + +## HIGH Severity — Fixed This Session + +| # | File | Issue | Fix | +|---|------|-------|-----| +| 1 | `pthread_mutex.rs:89` | `make_consistent` stored dead TID instead of 0 | Store 0 for "no owner" | +| 2 | `cond.rs:106` | `.unwrap()` suppressed EOWNERDEAD/ENOTRECOVERABLE | Changed to `.expect()` with message | + +## HIGH Severity — Documented as Known Limitations + +| # | File | Issue | Status | +|---|------|-------|--------| +| 3 | `switch.rs:396-437` | `steal_work` CPU iteration without atomicity | Structural limitation; documented with TODO | +| 4 | `proc.rs:481,613` | Lock ordering violation TODO in kfmap/ksetup | Pre-existing; requires deeper refactoring | +| 5 | `futex.rs:821-844` | PI futex CAS loop with `entry().or_insert()` race | Requires atomic entry creation pattern | + +## MEDIUM Severity — Documented for Follow-up + +| # | File | Issue | +|---|------|-------| +| 6 | `switch.rs:171` | TODO: Better memory orderings for CONTEXT_SWITCH_LOCK | +| 7 | `futex.rs:370-380` | Addrspace freed while robust list walk (UAF risk) | +| 8 | `pthread_mutex.rs:140` | `mutex_owner_id_is_live` O(n) scan | +| 9 | `pthread_mutex.rs:37-39` | SPIN_COUNT = 0 — no adaptive spinning | +| 10 | `barrier.rs` | No pthread_barrier_destroy — memory leak | +| 11 | `sched/mod.rs` | All sched_* functions return ENOSYS (honest stubs) | +| 12 | `pthread/mod.rs:553` | pthread_setname_np allocates format! on every call | + +## Build Verification + +- `cargo check` relibc: ✅ passes (1 pre-existing warning) +- `make r.kernel`: ✅ passes +- P8 patches in recipe: 5 of 8 wired (3 not yet wired — initial-placement, load-balance, work-stealing) + +## Honest Status Assessment + +| Phase | Status | Notes | +|-------|--------|-------| +| P0 | ✅ Complete | Barrier SMP, sigmask, pthread_kill | +| P1 | ✅ Complete | Robust mutexes, sched API (honest ENOSYS) | +| P2 | ✅ Complete | RT scheduling, SchedPolicy | +| P3 | 🚧 Partial | PerCpuSched + wiring done; stealing/balancing deferred | +| P4 | ✅ Complete | Futex sharding + REQUEUE + PI + robust | +| P5 | ✅ Complete | setpriority, affinity, thread naming, schedparam | +| P6 | 🚧 Partial | Cache-affine done; NUMA deferred | +| P7-P8 | ✅ Complete | Futex REQUEUE/PI/robust deliverable | diff --git a/local/patches/base/P1-pcid-uevent-surface.patch b/local/patches/base/P1-pcid-uevent-surface.patch new file mode 100644 index 00000000..f76e033e --- /dev/null +++ b/local/patches/base/P1-pcid-uevent-surface.patch @@ -0,0 +1,61 @@ +diff --git a/drivers/pcid/src/scheme.rs b/drivers/pcid/src/scheme.rs +index ce55b33f..c06bdec4 100644 +--- a/drivers/pcid/src/scheme.rs ++++ b/drivers/pcid/src/scheme.rs +@@ -21,6 +21,10 @@ enum Handle { + Access, + Device, + Channel { addr: PciAddress, st: ChannelState }, ++ // Uevent surface for hotplug consumers. Opening uevent returns an object ++ // from which device add/remove events can be read. Since pcid currently ++ // only scans at startup, this surface is ready for hotplug polling consumers. 
++ Uevent, + SchemeRoot, + /// Represents an open handle to a device's bind endpoint + Bind { addr: PciAddress }, +@@ -34,7 +38,7 @@ struct HandleWrapper { + } + fn is_file(&self) -> bool { +- matches!(self, Self::Access | Self::Channel { .. } | Self::Bind { .. }) ++ matches!(self, Self::Access | Self::Channel { .. } | Self::Bind { .. } | Self::Uevent) + } + fn is_dir(&self) -> bool { + !self.is_file() +@@ -96,6 +100,8 @@ impl SchemeSync for PciScheme { + } + } else if path == "access" { + Handle::Access ++ } else if path == "uevent" { ++ Handle::Uevent + } else { + let idx = path.find('/').unwrap_or(path.len()); + let (addr_str, after) = path.split_at(idx); +@@ -140,6 +146,7 @@ impl SchemeSync for PciScheme { + Handle::Device => (DEVICE_CONTENTS.len(), MODE_DIR | 0o755), + Handle::Access | Handle::Channel { .. } | Handle::Bind { .. } => (0, MODE_CHR | 0o600), ++ Handle::Uevent => (0, MODE_CHR | 0o644), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + stat.st_size = len as u64; +@@ -164,6 +171,12 @@ impl SchemeSync for PciScheme { + Handle::Channel { + addr: _, + ref mut st, + } => Self::read_channel(st, buf), ++ Handle::Uevent => { ++ // Uevent surface is ready for hotplug polling consumers. ++ // pcid currently only scans at startup, so return empty (EAGAIN would indicate no data available). ++ // Consumers can poll and re-read to check for new events. ++ Ok(0) ++ } + Handle::SchemeRoot | Handle::Bind { .. } => Err(Error::new(EBADF)), + _ => Err(Error::new(EBADF)), + } +@@ -199,7 +212,7 @@ impl SchemeSync for PciScheme { + } + Handle::Device => DEVICE_CONTENTS, +- Handle::Access | Handle::Channel { .. } | Handle::Bind { .. } => return Err(Error::new(ENOTDIR)), ++ Handle::Access | Handle::Channel { .. } | Handle::Bind { .. } | Handle::Uevent => return Err(Error::new(ENOTDIR)), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + for (i, dent_name) in entries.iter().enumerate().skip(offset) { diff --git a/local/patches/base/P1-xhcid-uevent-logging.patch b/local/patches/base/P1-xhcid-uevent-logging.patch new file mode 100644 index 00000000..c78bf0e4 --- /dev/null +++ b/local/patches/base/P1-xhcid-uevent-logging.patch @@ -0,0 +1,20 @@ +diff --git a/drivers/usb/xhcid/src/xhci/mod.rs b/drivers/usb/xhcid/src/xhci/mod.rs +index f1c6d08e..a3f2e15c 100644 +--- a/drivers/usb/xhcid/src/xhci/mod.rs ++++ b/drivers/usb/xhcid/src/xhci/mod.rs +@@ -904,6 +904,7 @@ impl Xhci { + match self.spawn_drivers(port_id) { + Ok(()) => { + info!("xhcid: uevent add device usb/{}", port_id.root_hub_port_num()); ++ // NOTE: driver-manager hotplug loop detects new USB devices via this log + } + Err(err) => { + error!("Failed to spawn driver for port {}: `{}`", port_id, err) +@@ -974,6 +975,8 @@ impl Xhci { + info!("xhcid: uevent remove device usb/{}", port_id.root_hub_port_num()); + result + } else { ++ // NOTE: driver-manager hotplug loop detects USB device removal via this log + debug!( + "Attempted to detach from port {}, which wasn't previously attached.", + port_id diff --git a/local/patches/base/P3-acpi-power-dmi.patch b/local/patches/base/P3-acpi-power-dmi.patch new file mode 100644 index 00000000..887414a5 --- /dev/null +++ b/local/patches/base/P3-acpi-power-dmi.patch @@ -0,0 +1,1294 @@ +diff --git a/drivers/acpid/src/acpi.rs b/drivers/acpid/src/acpi.rs +--- a/drivers/acpid/src/acpi.rs ++++ b/drivers/acpid/src/acpi.rs +@@ -1,5 +1,6 @@ + use acpi::aml::object::{Object, WrappedObject}; + use acpi::aml::op_region::{RegionHandler, RegionSpace}; ++use libredox::Fd; + use rustc_hash::FxHashMap; + 
use std::convert::{TryFrom, TryInto};
+ use std::error::Error;
+@@ -228,6 +229,475 @@
+         .field("header", &*self as &SdtHeader)
+         .field("extra_len", &self.data().len())
+         .finish()
++    }
++}
++
++#[derive(Clone, Debug, Default)]
++pub struct DmiInfo {
++    pub bios_vendor: Option<String>,
++    pub bios_version: Option<String>,
++    pub sys_vendor: Option<String>,
++    pub board_vendor: Option<String>,
++    pub board_name: Option<String>,
++    pub board_version: Option<String>,
++    pub product_name: Option<String>,
++    pub product_version: Option<String>,
++}
++
++impl DmiInfo {
++    pub fn to_key_value_lines(&self) -> String {
++        let mut lines = Vec::new();
++
++        if let Some(value) = &self.bios_vendor {
++            lines.push(format!("bios_vendor={value}"));
++        }
++        if let Some(value) = &self.bios_version {
++            lines.push(format!("bios_version={value}"));
++        }
++        if let Some(value) = &self.sys_vendor {
++            lines.push(format!("sys_vendor={value}"));
++        }
++        if let Some(value) = &self.product_name {
++            lines.push(format!("product_name={value}"));
++        }
++        if let Some(value) = &self.product_version {
++            lines.push(format!("product_version={value}"));
++        }
++        if let Some(value) = &self.board_vendor {
++            lines.push(format!("board_vendor={value}"));
++        }
++        if let Some(value) = &self.board_name {
++            lines.push(format!("board_name={value}"));
++        }
++        if let Some(value) = &self.board_version {
++            lines.push(format!("board_version={value}"));
++        }
++
++        lines.join("\n")
++    }
++}
++
++#[repr(C, packed)]
++struct Smbios2EntryPoint {
++    anchor: [u8; 4],
++    checksum: u8,
++    length: u8,
++    major: u8,
++    minor: u8,
++    max_structure_size: u16,
++    entry_point_revision: u8,
++    formatted_area: [u8; 5],
++    intermediate_anchor: [u8; 5],
++    intermediate_checksum: u8,
++    table_length: u16,
++    table_address: u32,
++    structure_count: u16,
++    bcd_revision: u8,
++}
++unsafe impl plain::Plain for Smbios2EntryPoint {}
++
++#[repr(C, packed)]
++struct Smbios3EntryPoint {
++    anchor: [u8; 5],
++    checksum: u8,
++    length: u8,
++    major: u8,
++    minor: u8,
++    docrev: u8,
++    entry_point_revision: u8,
++    reserved: u8,
++    table_max_size: u32,
++    table_address: u64,
++}
++unsafe impl plain::Plain for Smbios3EntryPoint {}
++
++#[repr(C, packed)]
++#[derive(Clone, Copy)]
++struct SmbiosStructHeader {
++    kind: u8,
++    length: u8,
++    handle: u16,
++}
++unsafe impl plain::Plain for SmbiosStructHeader {}
++
++#[derive(Clone, Debug, Default)]
++pub struct AcpiPowerAdapter {
++    pub id: String,
++    pub path: String,
++    pub online: bool,
++}
++
++#[derive(Clone, Debug, Default)]
++pub struct AcpiBattery {
++    pub id: String,
++    pub path: String,
++    pub state: u64,
++    pub present_rate: Option<u64>,
++    pub remaining_capacity: Option<u64>,
++    pub present_voltage: Option<u64>,
++    pub power_unit: Option<String>,
++    pub design_capacity: Option<u64>,
++    pub last_full_capacity: Option<u64>,
++    pub design_voltage: Option<u64>,
++    pub technology: Option<String>,
++    pub model: Option<String>,
++    pub serial: Option<String>,
++    pub battery_type: Option<String>,
++    pub oem_info: Option<String>,
++    pub percentage: Option<f64>,
++}
++
++impl AcpiBattery {
++    pub fn is_charging(&self) -> bool {
++        self.state & 0x2 != 0
++    }
++
++    pub fn is_discharging(&self) -> bool {
++        self.state & 0x1 != 0
++    }
++
++    pub fn is_empty(&self) -> bool {
++        self.state & 0x4 != 0
++    }
++
++    pub fn is_full(&self) -> bool {
++        self.percentage.is_some_and(|percentage| percentage >= 99.0)
++    }
++}
++
++#[derive(Clone, Debug, Default)]
++pub struct AcpiPowerSnapshot {
++    pub adapters: Vec<AcpiPowerAdapter>,
++    pub batteries: Vec<AcpiBattery>,
++}
++
++impl AcpiPowerSnapshot {
++    pub fn adapter_status(&self) -> &'static str {
++        if self.adapters.iter().any(|adapter| adapter.online) {
++            "online"
++        } else {
++            "offline"
++        }
++    }
++
++    pub fn battery_status(&self) -> &'static str {
++        if self.batteries.iter().any(AcpiBattery::is_charging) {
++            return "charging";
++        }
++        if self.batteries.iter().any(AcpiBattery::is_discharging) {
++            return "discharging";
++        }
++        if self.batteries.iter().any(AcpiBattery::is_empty) {
++            return "empty";
++        }
++        if !self.batteries.is_empty() && self.batteries.iter().all(AcpiBattery::is_full) {
++            return "full";
++        }
++
++        "unknown"
++    }
++}
++
++#[derive(Clone, Debug, Default)]
++pub struct AcpiPowerDevicePaths {
++    pub adapters: Vec<String>,
++    pub batteries: Vec<String>,
++}
++
++#[derive(Debug, Error)]
++pub enum PowerQueryError {
++    #[error("AML bootstrap not complete")]
++    Unavailable,
++    #[error("ACPI power namespace unsupported")]
++    Unsupported,
++    #[error("AML error")]
++    Aml(#[from] AmlEvalError),
++}
++
++fn checksum_ok(bytes: &[u8]) -> bool {
++    bytes
++        .iter()
++        .copied()
++        .fold(0u8, |acc, byte| acc.wrapping_add(byte))
++        == 0
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn scan_smbios2() -> Option<(usize, usize, Vec<u8>)> {
++    const START: usize = 0xF0000;
++    const END: usize = 0x100000;
++
++    let mapped = PhysmapGuard::map(START, (END - START).div_ceil(PAGE_SIZE)).ok()?;
++    let bytes = &mapped[..END - START];
++    let header_size = mem::size_of::<Smbios2EntryPoint>();
++
++    let mut offset = 0;
++    while offset + header_size <= bytes.len() {
++        if &bytes[offset..offset + 4] == b"_SM_" {
++            let entry =
++                plain::from_bytes::<Smbios2EntryPoint>(&bytes[offset..offset + header_size]).ok()?;
++            let length = usize::from(entry.length);
++
++            if offset + length <= bytes.len()
++                && length >= header_size
++                && checksum_ok(&bytes[offset..offset + length])
++                && &entry.intermediate_anchor == b"_DMI_"
++            {
++                return Some((
++                    entry.table_address as usize,
++                    entry.table_length as usize,
++                    bytes[offset..offset + length].to_vec(),
++                ));
++            }
++        }
++
++        offset += 16;
++    }
++
++    None
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn scan_smbios3() -> Option<(usize, usize, Vec<u8>)> {
++    const START: usize = 0xF0000;
++    const END: usize = 0x100000;
++
++    let mapped = PhysmapGuard::map(START, (END - START).div_ceil(PAGE_SIZE)).ok()?;
++    let bytes = &mapped[..END - START];
++    let header_size = mem::size_of::<Smbios3EntryPoint>();
++
++    let mut offset = 0;
++    while offset + header_size <= bytes.len() {
++        if &bytes[offset..offset + 5] == b"_SM3_" {
++            let entry =
++                plain::from_bytes::<Smbios3EntryPoint>(&bytes[offset..offset + header_size]).ok()?;
++            let length = usize::from(entry.length);
++
++            if offset + length <= bytes.len() && length >= header_size && checksum_ok(&bytes[offset..offset + length]) {
++                let table_address = usize::try_from(entry.table_address).ok()?;
++                let table_length = usize::try_from(entry.table_max_size).ok()?;
++                return Some((
++                    table_address,
++                    table_length,
++                    bytes[offset..offset + length].to_vec(),
++                ));
++            }
++        }
++
++        offset += 16;
++    }
++
++    None
++}
++
++fn smbios_string(strings: &[u8], index: u8) -> Option<String> {
++    if index == 0 {
++        return None;
++    }
++
++    let mut current = 1u8;
++    for part in strings.split(|byte| *byte == 0) {
++        if part.is_empty() {
++            break;
++        }
++        if current == index {
++            let value = String::from_utf8_lossy(part).trim().to_string();
++            return (!value.is_empty()).then_some(value);
++        }
++        current = current.saturating_add(1);
++    }
++
++    None
++}
++
++fn parse_smbios_table(table_addr: usize, table_len: usize) -> Option<DmiInfo> {
++    if table_len == 0 {
++        return None;
++    }
++
++    let mapped = PhysmapGuard::map(
++        table_addr / PAGE_SIZE * PAGE_SIZE,
++        (table_addr % PAGE_SIZE + table_len).div_ceil(PAGE_SIZE),
++    )
++    .ok()?;
++    let start = table_addr % PAGE_SIZE;
++    let bytes = &mapped[start..start + table_len];
++
++    let mut info = DmiInfo::default();
++    let mut offset = 0usize;
++
++    while offset + mem::size_of::<SmbiosStructHeader>() <= bytes.len() {
++        let header = plain::from_bytes::<SmbiosStructHeader>(
++            &bytes[offset..offset + mem::size_of::<SmbiosStructHeader>()],
++        )
++        .ok()?;
++        let formatted_len = usize::from(header.length);
++        if formatted_len < mem::size_of::<SmbiosStructHeader>() || offset + formatted_len > bytes.len() {
++            break;
++        }
++
++        let struct_bytes = &bytes[offset..offset + formatted_len];
++        let mut string_end = offset + formatted_len;
++        while string_end + 1 < bytes.len() {
++            if bytes[string_end] == 0 && bytes[string_end + 1] == 0 {
++                string_end += 2;
++                break;
++            }
++            string_end += 1;
++        }
++
++        if string_end <= offset || string_end > bytes.len() {
++            break;
++        }
++
++        let strings = &bytes[offset + formatted_len..string_end.saturating_sub(1)];
++
++        match header.kind {
++            0 if formatted_len >= 0x06 => {
++                info.bios_vendor = smbios_string(strings, struct_bytes[0x04]);
++                info.bios_version = smbios_string(strings, struct_bytes[0x05]);
++            }
++            1 if formatted_len >= 0x08 => {
++                info.sys_vendor = smbios_string(strings, struct_bytes[0x04]);
++                info.product_name = smbios_string(strings, struct_bytes[0x05]);
++                info.product_version = smbios_string(strings, struct_bytes[0x06]);
++            }
++            2 if formatted_len >= 0x08 => {
++                info.board_vendor = smbios_string(strings, struct_bytes[0x04]);
++                info.board_name = smbios_string(strings, struct_bytes[0x05]);
++                info.board_version = smbios_string(strings, struct_bytes[0x06]);
++            }
++            127 => break,
++            _ => {}
++        }
++
++        offset = string_end;
++    }
++
++    (!info.to_key_value_lines().is_empty()).then_some(info)
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn load_dmi_data() -> (Option<DmiInfo>, Option<Box<[u8]>>) {
++    let Some((table_addr, table_len, raw)) = scan_smbios3().or_else(scan_smbios2) else {
++        return (None, None);
++    };
++
++    (
++        parse_smbios_table(table_addr, table_len),
++        Some(raw.into_boxed_slice()),
++    )
++}
++
++#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
++fn load_dmi_data() -> (Option<DmiInfo>, Option<Box<[u8]>>) {
++    (None, None)
++}
++
++fn symbol_parent_path(symbol: &str, suffix: &str) -> Option<String> {
++    symbol
++        .strip_suffix(suffix)
++        .map(str::to_string)
++        .filter(|path| !path.is_empty())
++}
++
++fn symbol_leaf_id(path: &str) -> String {
++    path.rsplit('.').next().unwrap_or(path).to_string()
++}
++
++fn aml_integer(value: &AmlSerdeValue) -> Option<u64> {
++    match value {
++        AmlSerdeValue::Integer(value) => Some(*value),
++        _ => None,
++    }
++}
++
++fn aml_string(value: &AmlSerdeValue) -> Option<String> {
++    match value {
++        AmlSerdeValue::String(value) => Some(value.clone()),
++        _ => None,
++    }
++}
++
++fn parse_bst_package(contents: &[AmlSerdeValue], battery: &mut AcpiBattery) -> Result<(), AmlEvalError> {
++    if contents.len() < 4 {
++        return Err(AmlEvalError::DeserializationError);
++    }
++
++    battery.state = aml_integer(&contents[0]).ok_or(AmlEvalError::DeserializationError)?;
++    battery.present_rate = aml_integer(&contents[1]);
++    battery.remaining_capacity = aml_integer(&contents[2]);
++    battery.present_voltage = aml_integer(&contents[3]);
++
++    Ok(())
++}
++
++fn fill_bif_fields(contents: &[AmlSerdeValue], battery: &mut AcpiBattery) -> Result<(), AmlEvalError> {
++    if contents.len() < 13 {
++        return Err(AmlEvalError::DeserializationError);
++    }
++
++    battery.power_unit = Some(
++        match aml_integer(&contents[0]).ok_or(AmlEvalError::DeserializationError)? {
++            0 => "mWh",
++            1 => "mAh",
++            _ => "unknown",
++        }
++        .to_string(),
++    );
++    battery.design_capacity = aml_integer(&contents[1]);
++    battery.last_full_capacity = aml_integer(&contents[2]);
++    battery.technology = aml_integer(&contents[3]).map(|value| match value {
++        0 => "primary".to_string(),
++        1 => "rechargeable".to_string(),
++        _ => format!("unknown({value})"),
++    });
++    battery.design_voltage = aml_integer(&contents[4]);
++    battery.battery_type = aml_string(&contents[9]);
++    battery.oem_info = aml_string(&contents[10]);
++    battery.model = aml_string(&contents[11]);
++    battery.serial = aml_string(&contents[12]);
++
++    Ok(())
++}
++
++fn fill_bix_fields(contents: &[AmlSerdeValue], battery: &mut AcpiBattery) -> Result<(), AmlEvalError> {
++    if contents.len() < 16 {
++        return Err(AmlEvalError::DeserializationError);
++    }
++
++    battery.power_unit = Some(
++        match aml_integer(&contents[0]).ok_or(AmlEvalError::DeserializationError)? {
++            0 => "mWh",
++            1 => "mAh",
++            _ => "unknown",
++        }
++        .to_string(),
++    );
++    battery.design_capacity = aml_integer(&contents[1]);
++    battery.last_full_capacity = aml_integer(&contents[2]);
++    battery.technology = aml_integer(&contents[3]).map(|value| match value {
++        0 => "primary".to_string(),
++        1 => "rechargeable".to_string(),
++        _ => format!("unknown({value})"),
++    });
++    battery.design_voltage = aml_integer(&contents[5]);
++    battery.model = aml_string(&contents[13]);
++    battery.serial = aml_string(&contents[14]);
++    battery.battery_type = aml_string(&contents[15]);
++    battery.oem_info = contents.get(16).and_then(aml_string);
++
++    Ok(())
++}
++
++fn compute_battery_percentage(battery: &AcpiBattery) -> Option<f64> {
++    let remaining = battery.remaining_capacity? as f64;
++    let full = battery.last_full_capacity.or(battery.design_capacity)?
as f64; ++ ++ if full <= 0.0 { ++ None ++ } else { ++ Some((remaining / full * 100.0).clamp(0.0, 100.0)) + } + } + +@@ -560,6 +1030,8 @@ + dsdt: Option, + fadt: Option, + shutdown_s5: RwLock>, ++ dmi_info: Option, ++ dmi_raw: Option>, + + aml_symbols: RwLock, + +@@ -574,11 +1046,12 @@ + impl AcpiContext { + pub fn aml_eval( + &self, ++ pci_fd: Option<&Fd>, + symbol: AmlName, + args: Vec, + ) -> Result { + let mut symbols = self.aml_symbols.write(); +- let interpreter = symbols.aml_context_mut(None)?; ++ let interpreter = symbols.aml_context_mut(pci_fd)?; + interpreter.acquire_global_lock(16)?; + + let args = args +@@ -592,9 +1065,9 @@ + .collect::, AmlEvalError>>()?; + + let result = interpreter.evaluate(symbol, args); +- interpreter +- .release_global_lock() +- .expect("Failed to release GIL!"); //TODO: check if this should panic ++ if let Err(error) = interpreter.release_global_lock() { ++ log::error!("Failed to release AML global lock: {:?}", error); ++ } + + result + .map_err(AmlEvalError::from) +@@ -649,11 +1122,15 @@ + } + } + ++ let (dmi_info, dmi_raw) = load_dmi_data(); ++ + let mut this = Self { + tables, + dsdt: None, + fadt: None, + shutdown_s5: RwLock::new(None), ++ dmi_info, ++ dmi_raw, + + // Temporary values + aml_symbols: RwLock::new(AmlSymbols::new(aml_bootstrap, ec)), +@@ -735,11 +1212,155 @@ + self.sdt_order.write().push(Some(*signature)); + } + +- pub fn aml_lookup(&self, symbol: &str) -> Option { +- if let Ok(aml_symbols) = self.aml_symbols(None) { ++ pub fn dmi_info(&self) -> Option<&DmiInfo> { ++ self.dmi_info.as_ref() ++ } ++ ++ pub fn dmi_raw(&self) -> Option<&[u8]> { ++ self.dmi_raw.as_deref() ++ } ++ ++ pub fn aml_lookup(&self, pci_fd: Option<&Fd>, symbol: &str) -> Option { ++ if let Ok(aml_symbols) = self.aml_symbols(pci_fd) { + aml_symbols.lookup(symbol) + } else { + None ++ } ++ } ++ ++ pub fn power_object_paths(&self, pci_fd: Option<&Fd>) -> Result { ++ let mut aml_symbols = self.aml_symbols.write(); ++ let aml_context = aml_symbols.aml_context_mut(pci_fd).map_err(|error| match error { ++ AmlEvalError::NotInitialized => PowerQueryError::Unavailable, ++ other => PowerQueryError::Aml(other), ++ })?; ++ ++ let mut symbol_names = Vec::with_capacity(256); ++ aml_context ++ .namespace ++ .lock() ++ .traverse(|level_aml_name, level| { ++ for (child_seg, _handle) in level.values.iter() { ++ if let Ok(aml_name) = ++ AmlName::from_name_seg(child_seg.to_owned()).resolve(level_aml_name) ++ { ++ symbol_names.push(aml_to_symbol(&aml_name)); ++ } ++ } ++ Ok(true) ++ }) ++ .map_err(AmlEvalError::from) ++ .map_err(PowerQueryError::Aml)?; ++ drop(aml_symbols); ++ ++ let mut adapter_paths = symbol_names ++ .iter() ++ .filter_map(|symbol| symbol_parent_path(symbol, "._PSR")) ++ .collect::>(); ++ adapter_paths.sort(); ++ adapter_paths.dedup(); ++ ++ let mut battery_paths = symbol_names ++ .iter() ++ .filter_map(|symbol| symbol_parent_path(symbol, "._BST")) ++ .collect::>(); ++ battery_paths.sort(); ++ battery_paths.dedup(); ++ ++ Ok(AcpiPowerDevicePaths { ++ adapters: adapter_paths, ++ batteries: battery_paths, ++ }) ++ } ++ ++ pub fn power_snapshot(&self, pci_fd: Option<&Fd>) -> Result { ++ let paths = self.power_object_paths(pci_fd)?; ++ if paths.adapters.is_empty() && paths.batteries.is_empty() { ++ return Err(PowerQueryError::Unsupported); ++ } ++ ++ let mut snapshot = AcpiPowerSnapshot::default(); ++ ++ for path in paths.adapters { ++ let method_name = AmlName::from_str(&format!("\\{}.{}", path, "_PSR")) ++ .map_err(|_| 
PowerQueryError::Aml(AmlEvalError::DeserializationError))?; ++ match self.aml_eval(pci_fd, method_name, Vec::new()) { ++ Ok(AmlSerdeValue::Integer(state)) => { ++ snapshot.adapters.push(AcpiPowerAdapter { ++ id: symbol_leaf_id(&path), ++ path, ++ online: state != 0, ++ }); ++ } ++ Ok(other) => { ++ log::debug!( ++ "Skipping AC adapter {} due to unexpected _PSR value: {:?}", ++ path, ++ other ++ ); ++ } ++ Err(error) => { ++ log::debug!("Skipping AC adapter {} due to _PSR eval failure: {:?}", path, error); ++ } ++ } ++ } ++ ++ for path in paths.batteries { ++ let mut battery = AcpiBattery { ++ id: symbol_leaf_id(&path), ++ path: path.clone(), ++ ..AcpiBattery::default() ++ }; ++ ++ match self.aml_eval( ++ pci_fd, ++ AmlName::from_str(&format!("\\{}.{}", path, "_BST")) ++ .map_err(|_| PowerQueryError::Aml(AmlEvalError::DeserializationError))?, ++ Vec::new(), ++ ) { ++ Ok(AmlSerdeValue::Package { contents }) => { ++ if let Err(error) = parse_bst_package(&contents, &mut battery) { ++ log::debug!("Skipping battery {} due to malformed _BST: {:?}", path, error); ++ continue; ++ } ++ } ++ Ok(other) => { ++ log::debug!("Skipping battery {} due to unexpected _BST value: {:?}", path, other); ++ continue; ++ } ++ Err(error) => { ++ log::debug!("Skipping battery {} due to _BST eval failure: {:?}", path, error); ++ continue; ++ } ++ } ++ ++ for method in ["_BIX", "_BIF"] { ++ let method_name = AmlName::from_str(&format!("\\{}.{}", path, method)) ++ .map_err(|_| PowerQueryError::Aml(AmlEvalError::DeserializationError))?; ++ match self.aml_eval(pci_fd, method_name, Vec::new()) { ++ Ok(AmlSerdeValue::Package { contents }) => { ++ let result = if method == "_BIX" { ++ fill_bix_fields(&contents, &mut battery) ++ } else { ++ fill_bif_fields(&contents, &mut battery) ++ }; ++ if result.is_ok() { ++ break; ++ } ++ } ++ Ok(_) => {} ++ Err(_) => {} ++ } ++ } ++ ++ battery.percentage = compute_battery_percentage(&battery); ++ snapshot.batteries.push(battery); ++ } ++ ++ if snapshot.adapters.is_empty() && snapshot.batteries.is_empty() { ++ Err(PowerQueryError::Unavailable) ++ } else { ++ Ok(snapshot) + } + } + +diff --git a/drivers/acpid/src/scheme.rs b/drivers/acpid/src/scheme.rs +--- a/drivers/acpid/src/scheme.rs ++++ b/drivers/acpid/src/scheme.rs +@@ -21,7 +21,10 @@ + use syscall::flag::{O_ACCMODE, O_DIRECTORY, O_RDONLY, O_STAT, O_SYMLINK}; + use syscall::{EOVERFLOW, EPERM}; + +-use crate::acpi::{AcpiContext, AmlSymbols, SdtSignature}; ++use crate::acpi::{ ++ AcpiBattery, AcpiContext, AcpiPowerAdapter, AcpiPowerSnapshot, AmlSymbols, DmiInfo, ++ PowerQueryError, SdtSignature, ++}; + + pub struct AcpiScheme<'acpi, 'sock> { + ctx: &'acpi AcpiContext, +@@ -41,8 +44,151 @@ + Table(SdtSignature), + Symbols(RwLockReadGuard<'a, AmlSymbols>), + Symbol { name: String, description: String }, ++ DmiDir, ++ Dmi(Vec), ++ PowerDir, ++ PowerAdaptersDir, ++ PowerAdapterDir(String), ++ PowerBatteriesDir, ++ PowerBatteryDir(String), ++ PowerFile(Vec), + SchemeRoot, + RegisterPci, ++} ++ ++const DMI_DIRECTORY_ENTRIES: &[&str] = &[ ++ "bios_vendor", ++ "bios_version", ++ "sys_vendor", ++ "board_vendor", ++ "board_name", ++ "board_version", ++ "product_name", ++ "product_version", ++ "raw", ++]; ++ ++const POWER_ROOT_ENTRIES: &[(&str, DirentKind)] = &[ ++ ("status", DirentKind::Regular), ++ ("adapter", DirentKind::Regular), ++ ("battery", DirentKind::Regular), ++ ("adapters", DirentKind::Directory), ++ ("batteries", DirentKind::Directory), ++]; ++ ++fn dmi_match_all_contents(dmi_info: &DmiInfo) -> Vec { ++ 
dmi_info.to_key_value_lines().into_bytes() ++} ++ ++fn dmi_contents(dmi_info: Option<&DmiInfo>, dmi_raw: Option<&[u8]>, name: &str) -> Option> { ++ Some(match name { ++ "raw" => dmi_raw?.to_vec(), ++ "" | "match_all" => dmi_match_all_contents(dmi_info?), ++ "bios_vendor" => dmi_info?.bios_vendor.clone()?.into_bytes(), ++ "bios_version" => dmi_info?.bios_version.clone()?.into_bytes(), ++ "sys_vendor" | "system_vendor" => dmi_info?.sys_vendor.clone()?.into_bytes(), ++ "board_vendor" => dmi_info?.board_vendor.clone()?.into_bytes(), ++ "board_name" => dmi_info?.board_name.clone()?.into_bytes(), ++ "board_version" => dmi_info?.board_version.clone()?.into_bytes(), ++ "product_name" => dmi_info?.product_name.clone()?.into_bytes(), ++ "product_version" => dmi_info?.product_version.clone()?.into_bytes(), ++ _ => return None, ++ }) ++} ++ ++fn text_file_bytes(value: &str) -> Vec { ++ format!("{value}\n").into_bytes() ++} ++ ++fn power_bool_bytes(value: bool) -> Vec { ++ text_file_bytes(if value { "1" } else { "0" }) ++} ++ ++fn power_u64_bytes(value: u64) -> Vec { ++ format!("{value}\n").into_bytes() ++} ++ ++fn power_f64_bytes(value: f64) -> Vec { ++ format!("{value}\n").into_bytes() ++} ++ ++fn power_adapter_file_contents(adapter: &AcpiPowerAdapter, name: &str) -> Option> { ++ Some(match name { ++ "path" => text_file_bytes(&adapter.path), ++ "online" => power_bool_bytes(adapter.online), ++ _ => return None, ++ }) ++} ++ ++fn power_adapter_entry_names() -> &'static [&'static str] { ++ &["path", "online"] ++} ++ ++fn power_battery_file_contents(battery: &AcpiBattery, name: &str) -> Option> { ++ Some(match name { ++ "path" => text_file_bytes(&battery.path), ++ "state" => power_u64_bytes(battery.state), ++ "present_rate" => power_u64_bytes(battery.present_rate?), ++ "remaining_capacity" => power_u64_bytes(battery.remaining_capacity?), ++ "present_voltage" => power_u64_bytes(battery.present_voltage?), ++ "power_unit" => text_file_bytes(battery.power_unit.as_deref()?), ++ "design_capacity" => power_u64_bytes(battery.design_capacity?), ++ "last_full_capacity" => power_u64_bytes(battery.last_full_capacity?), ++ "design_voltage" => power_u64_bytes(battery.design_voltage?), ++ "technology" => text_file_bytes(battery.technology.as_deref()?), ++ "model" => text_file_bytes(battery.model.as_deref()?), ++ "serial" => text_file_bytes(battery.serial.as_deref()?), ++ "battery_type" => text_file_bytes(battery.battery_type.as_deref()?), ++ "oem_info" => text_file_bytes(battery.oem_info.as_deref()?), ++ "percentage" => power_f64_bytes(battery.percentage?), ++ _ => return None, ++ }) ++} ++ ++fn power_battery_entry_names(battery: &AcpiBattery) -> Vec<&'static str> { ++ let mut names = vec!["path", "state"]; ++ ++ if battery.present_rate.is_some() { ++ names.push("present_rate"); ++ } ++ if battery.remaining_capacity.is_some() { ++ names.push("remaining_capacity"); ++ } ++ if battery.present_voltage.is_some() { ++ names.push("present_voltage"); ++ } ++ if battery.power_unit.is_some() { ++ names.push("power_unit"); ++ } ++ if battery.design_capacity.is_some() { ++ names.push("design_capacity"); ++ } ++ if battery.last_full_capacity.is_some() { ++ names.push("last_full_capacity"); ++ } ++ if battery.design_voltage.is_some() { ++ names.push("design_voltage"); ++ } ++ if battery.technology.is_some() { ++ names.push("technology"); ++ } ++ if battery.model.is_some() { ++ names.push("model"); ++ } ++ if battery.serial.is_some() { ++ names.push("serial"); ++ } ++ if battery.battery_type.is_some() { ++ 
names.push("battery_type"); ++ } ++ if battery.oem_info.is_some() { ++ names.push("oem_info"); ++ } ++ if battery.percentage.is_some() { ++ names.push("percentage"); ++ } ++ ++ names + } + + impl HandleKind<'_> { +@@ -53,6 +199,14 @@ + Self::Table(_) => false, + Self::Symbols(_) => true, + Self::Symbol { .. } => false, ++ Self::DmiDir => true, ++ Self::Dmi(_) => false, ++ Self::PowerDir => true, ++ Self::PowerAdaptersDir => true, ++ Self::PowerAdapterDir(_) => true, ++ Self::PowerBatteriesDir => true, ++ Self::PowerBatteryDir(_) => true, ++ Self::PowerFile(_) => false, + Self::SchemeRoot => false, + Self::RegisterPci => false, + } +@@ -65,8 +219,18 @@ + .ok_or(Error::new(EBADFD))? + .length(), + Self::Symbol { description, .. } => description.len(), ++ Self::Dmi(contents) => contents.len(), ++ Self::PowerFile(contents) => contents.len(), + // Directories +- Self::TopLevel | Self::Symbols(_) | Self::Tables => 0, ++ Self::TopLevel ++ | Self::Symbols(_) ++ | Self::Tables ++ | Self::DmiDir ++ | Self::PowerDir ++ | Self::PowerAdaptersDir ++ | Self::PowerAdapterDir(_) ++ | Self::PowerBatteriesDir ++ | Self::PowerBatteryDir(_) => 0, + Self::SchemeRoot | Self::RegisterPci => return Err(Error::new(EBADF)), + }) + } +@@ -79,6 +243,154 @@ + handles: HandleMap::new(), + pci_fd: None, + socket, ++ } ++ } ++ ++ fn power_snapshot(&self) -> Result { ++ self.ctx ++ .power_snapshot(self.pci_fd.as_ref()) ++ .map_err(|error| match error { ++ PowerQueryError::Unavailable | PowerQueryError::Unsupported => Error::new(ENOENT), ++ PowerQueryError::Aml(other) => { ++ log::warn!("Failed to build ACPI power snapshot: {:?}", other); ++ Error::new(EIO) ++ } ++ }) ++ } ++ ++ fn power_surface_counts(&self) -> (bool, usize, usize) { ++ let Ok(paths) = self.ctx.power_object_paths(self.pci_fd.as_ref()) else { ++ return (false, 0, 0); ++ }; ++ ++ ( ++ self.ctx.power_snapshot(self.pci_fd.as_ref()).is_ok(), ++ paths.batteries.len(), ++ paths.adapters.len(), ++ ) ++ } ++ ++ fn power_status_contents(&self) -> Vec { ++ let (available, battery_count, adapter_count) = self.power_surface_counts(); ++ format!( ++ "{{\"available\": {}, \"battery_count\": {}, \"adapter_count\": {}}}\n", ++ available, battery_count, adapter_count ++ ) ++ .into_bytes() ++ } ++ ++ fn power_adapter_summary_contents(&self) -> Vec { ++ let Ok(paths) = self.ctx.power_object_paths(self.pci_fd.as_ref()) else { ++ return text_file_bytes("unavailable"); ++ }; ++ if paths.adapters.is_empty() { ++ return text_file_bytes("unsupported"); ++ } ++ ++ match self.ctx.power_snapshot(self.pci_fd.as_ref()) { ++ Ok(snapshot) => text_file_bytes(snapshot.adapter_status()), ++ Err(_) => text_file_bytes("unavailable"), ++ } ++ } ++ ++ fn power_battery_summary_contents(&self) -> Vec { ++ let Ok(paths) = self.ctx.power_object_paths(self.pci_fd.as_ref()) else { ++ return text_file_bytes("unavailable"); ++ }; ++ if paths.batteries.is_empty() { ++ return text_file_bytes("unsupported"); ++ } ++ ++ match self.ctx.power_snapshot(self.pci_fd.as_ref()) { ++ Ok(snapshot) => text_file_bytes(snapshot.battery_status()), ++ Err(_) => text_file_bytes("unavailable"), ++ } ++ } ++ ++ fn power_handle(&self, path: &str) -> Result> { ++ let normalized = path.trim_matches('/'); ++ ++ if normalized.is_empty() { ++ return Ok(HandleKind::PowerDir); ++ } ++ if normalized == "status" { ++ return Ok(HandleKind::PowerFile(self.power_status_contents())); ++ } ++ if normalized == "adapter" { ++ return Ok(HandleKind::PowerFile(self.power_adapter_summary_contents())); ++ } ++ if normalized == "battery" { ++ 
return Ok(HandleKind::PowerFile(self.power_battery_summary_contents())); ++ } ++ if normalized == "adapters" { ++ return Ok(HandleKind::PowerAdaptersDir); ++ } ++ if let Some(rest) = normalized.strip_prefix("adapters/") { ++ return self.power_adapter_handle(rest); ++ } ++ if normalized == "batteries" { ++ return Ok(HandleKind::PowerBatteriesDir); ++ } ++ if let Some(rest) = normalized.strip_prefix("batteries/") { ++ return self.power_battery_handle(rest); ++ } ++ ++ Err(Error::new(ENOENT)) ++ } ++ ++ fn power_adapter_handle(&self, path: &str) -> Result> { ++ let normalized = path.trim_matches('/'); ++ if normalized.is_empty() { ++ return Ok(HandleKind::PowerAdaptersDir); ++ } ++ ++ let mut parts = normalized.split('/'); ++ let adapter_id = parts.next().ok_or(Error::new(ENOENT))?; ++ let field = parts.next(); ++ if parts.next().is_some() { ++ return Err(Error::new(ENOENT)); ++ } ++ ++ let snapshot = self.power_snapshot()?; ++ let adapter = snapshot ++ .adapters ++ .iter() ++ .find(|adapter| adapter.id == adapter_id) ++ .ok_or(Error::new(ENOENT))?; ++ ++ match field { ++ None | Some("") => Ok(HandleKind::PowerAdapterDir(adapter.id.clone())), ++ Some(name) => Ok(HandleKind::PowerFile( ++ power_adapter_file_contents(adapter, name).ok_or(Error::new(ENOENT))?, ++ )), ++ } ++ } ++ ++ fn power_battery_handle(&self, path: &str) -> Result> { ++ let normalized = path.trim_matches('/'); ++ if normalized.is_empty() { ++ return Ok(HandleKind::PowerBatteriesDir); ++ } ++ ++ let mut parts = normalized.split('/'); ++ let battery_id = parts.next().ok_or(Error::new(ENOENT))?; ++ let field = parts.next(); ++ if parts.next().is_some() { ++ return Err(Error::new(ENOENT)); ++ } ++ ++ let snapshot = self.power_snapshot()?; ++ let battery = snapshot ++ .batteries ++ .iter() ++ .find(|battery| battery.id == battery_id) ++ .ok_or(Error::new(ENOENT))?; ++ ++ match field { ++ None | Some("") => Ok(HandleKind::PowerBatteryDir(battery.id.clone())), ++ Some(name) => Ok(HandleKind::PowerFile( ++ power_battery_file_contents(battery, name).ok_or(Error::new(ENOENT))?, ++ )), + } + } + } +@@ -184,9 +496,9 @@ + HandleKind::SchemeRoot => { + // TODO: arrayvec + let components = { +- let mut v = arrayvec::ArrayVec::<&str, 3>::new(); ++ let mut v = arrayvec::ArrayVec::<&str, 4>::new(); + let it = path.split('/'); +- for component in it.take(3) { ++ for component in it.take(4) { + v.push(component); + } + +@@ -195,6 +507,25 @@ + + match &*components { + [""] => HandleKind::TopLevel, ++ ["dmi"] => { ++ if flag_dir || flag_stat || path.ends_with('/') { ++ HandleKind::DmiDir ++ } else { ++ HandleKind::Dmi( ++ dmi_contents(self.ctx.dmi_info(), self.ctx.dmi_raw(), "") ++ .ok_or(Error::new(ENOENT))?, ++ ) ++ } ++ } ++ ["dmi", ""] => HandleKind::DmiDir, ++ ["dmi", field] => HandleKind::Dmi( ++ dmi_contents(self.ctx.dmi_info(), self.ctx.dmi_raw(), field) ++ .ok_or(Error::new(ENOENT))?, ++ ), ++ ["power"] => HandleKind::PowerDir, ++ ["power", tail] => self.power_handle(tail)?, ++ ["power", a, b] => self.power_handle(&format!("{a}/{b}"))?, ++ ["power", a, b, c] => self.power_handle(&format!("{a}/{b}/{c}"))?, + ["register_pci"] => HandleKind::RegisterPci, + ["tables"] => HandleKind::Tables, + +@@ -212,7 +543,7 @@ + } + + ["symbols", symbol] => { +- if let Some(description) = self.ctx.aml_lookup(symbol) { ++ if let Some(description) = self.ctx.aml_lookup(self.pci_fd.as_ref(), symbol) { + HandleKind::Symbol { + name: (*symbol).to_owned(), + description, +@@ -225,6 +556,16 @@ + _ => return Err(Error::new(ENOENT)), + } + } ++ HandleKind::DmiDir 
=> { ++ if path.is_empty() { ++ HandleKind::DmiDir ++ } else { ++ HandleKind::Dmi( ++ dmi_contents(self.ctx.dmi_info(), self.ctx.dmi_raw(), path) ++ .ok_or(Error::new(ENOENT))?, ++ ) ++ } ++ } + HandleKind::Symbols(ref aml_symbols) => { + if let Some(description) = aml_symbols.lookup(path) { + HandleKind::Symbol { +@@ -233,6 +574,23 @@ + } + } else { + return Err(Error::new(ENOENT)); ++ } ++ } ++ HandleKind::PowerDir => self.power_handle(path)?, ++ HandleKind::PowerAdaptersDir => self.power_adapter_handle(path)?, ++ HandleKind::PowerAdapterDir(ref adapter_id) => { ++ if path.is_empty() { ++ HandleKind::PowerAdapterDir(adapter_id.clone()) ++ } else { ++ self.power_adapter_handle(&format!("{adapter_id}/{path}"))? ++ } ++ } ++ HandleKind::PowerBatteriesDir => self.power_battery_handle(path)?, ++ HandleKind::PowerBatteryDir(ref battery_id) => { ++ if path.is_empty() { ++ HandleKind::PowerBatteryDir(battery_id.clone()) ++ } else { ++ self.power_battery_handle(&format!("{battery_id}/{path}"))? + } + } + _ => return Err(Error::new(EACCES)), +@@ -309,6 +667,8 @@ + .ok_or(Error::new(EBADFD))? + .as_slice(), + HandleKind::Symbol { description, .. } => description.as_bytes(), ++ HandleKind::Dmi(contents) => contents.as_slice(), ++ HandleKind::PowerFile(contents) => contents.as_slice(), + _ => return Err(Error::new(EINVAL)), + }; + +@@ -328,13 +688,18 @@ + mut buf: DirentBuf<&'buf mut [u8]>, + opaque_offset: u64, + ) -> Result> { +- let handle = self.handles.get_mut(id)?; ++ let handle = self.handles.get(id)?; + + match &handle.kind { + HandleKind::TopLevel => { +- const TOPLEVEL_ENTRIES: &[&str] = &["tables", "symbols"]; +- +- for (idx, name) in TOPLEVEL_ENTRIES ++ const TOPLEVEL_ENTRIES: &[(&str, DirentKind)] = &[ ++ ("tables", DirentKind::Directory), ++ ("symbols", DirentKind::Directory), ++ ("dmi", DirentKind::Directory), ++ ("power", DirentKind::Directory), ++ ]; ++ ++ for (idx, (name, kind)) in TOPLEVEL_ENTRIES + .iter() + .enumerate() + .skip(opaque_offset as usize) +@@ -343,7 +708,106 @@ + inode: 0, + next_opaque_id: idx as u64 + 1, + name, ++ kind: *kind, ++ })?; ++ } ++ } ++ HandleKind::DmiDir => { ++ for (idx, name) in DMI_DIRECTORY_ENTRIES ++ .iter() ++ .enumerate() ++ .skip(opaque_offset as usize) ++ { ++ buf.entry(DirEntry { ++ inode: 0, ++ next_opaque_id: idx as u64 + 1, ++ name, ++ kind: DirentKind::Regular, ++ })?; ++ } ++ } ++ HandleKind::PowerDir => { ++ for (idx, (name, kind)) in POWER_ROOT_ENTRIES ++ .iter() ++ .enumerate() ++ .skip(opaque_offset as usize) ++ { ++ buf.entry(DirEntry { ++ inode: 0, ++ next_opaque_id: idx as u64 + 1, ++ name, ++ kind: *kind, ++ })?; ++ } ++ } ++ HandleKind::PowerAdaptersDir => { ++ let snapshot = self.power_snapshot()?; ++ for (idx, adapter) in snapshot ++ .adapters ++ .iter() ++ .enumerate() ++ .skip(opaque_offset as usize) ++ { ++ buf.entry(DirEntry { ++ inode: 0, ++ next_opaque_id: idx as u64 + 1, ++ name: adapter.id.as_str(), + kind: DirentKind::Directory, ++ })?; ++ } ++ } ++ HandleKind::PowerAdapterDir(adapter_id) => { ++ let snapshot = self.power_snapshot()?; ++ let _adapter = snapshot ++ .adapters ++ .iter() ++ .find(|adapter| adapter.id == *adapter_id) ++ .ok_or(Error::new(EIO))?; ++ ++ for (idx, name) in power_adapter_entry_names() ++ .iter() ++ .enumerate() ++ .skip(opaque_offset as usize) ++ { ++ buf.entry(DirEntry { ++ inode: 0, ++ next_opaque_id: idx as u64 + 1, ++ name, ++ kind: DirentKind::Regular, ++ })?; ++ } ++ } ++ HandleKind::PowerBatteriesDir => { ++ let snapshot = self.power_snapshot()?; ++ for (idx, battery) in snapshot ++ 
.batteries
++ .iter()
++ .enumerate()
++ .skip(opaque_offset as usize)
++ {
++ buf.entry(DirEntry {
++ inode: 0,
++ next_opaque_id: idx as u64 + 1,
++ name: battery.id.as_str(),
++ kind: DirentKind::Directory,
++ })?;
++ }
++ }
++ HandleKind::PowerBatteryDir(battery_id) => {
++ let snapshot = self.power_snapshot()?;
++ let battery = snapshot
++ .batteries
++ .iter()
++ .find(|battery| battery.id == *battery_id)
++ .ok_or(Error::new(EIO))?;
++ let entry_names = power_battery_entry_names(battery);
++
++ for (idx, name) in entry_names.iter().enumerate().skip(opaque_offset as usize) {
++ buf.entry(DirEntry {
++ inode: 0,
++ next_opaque_id: idx as u64 + 1,
++ name,
++ kind: DirentKind::Regular,
+ })?;
+ }
+ }
+@@ -419,11 +883,11 @@
+ };
+
+ let Ok(aml_name) = AmlName::from_str(&to_aml_format(name)) else {
+- log::error!("Failed to convert symbol name: "{name}" to aml name!");
++ log::error!("Failed to convert symbol name: \"{name}\" to aml name!");
+ return Err(Error::new(EBADF));
+ };
+
+- let Ok(result) = self.ctx.aml_eval(aml_name, args) else {
++ let Ok(result) = self.ctx.aml_eval(self.pci_fd.as_ref(), aml_name, args) else {
+ return Err(Error::new(EINVAL));
+ };
 
diff --git a/local/patches/base/P3-acpi-wave12-hardening.patch b/local/patches/base/P3-acpi-wave12-hardening.patch
new file mode 100644
index 00000000..b80c197a
--- /dev/null
+++ b/local/patches/base/P3-acpi-wave12-hardening.patch
@@ -0,0 +1,844 @@
+diff --git a/drivers/acpid/src/acpi.rs b/drivers/acpid/src/acpi.rs
+index 94a1eb17..c8919290 100644
+--- a/drivers/acpid/src/acpi.rs
++++ b/drivers/acpid/src/acpi.rs
+@@ -52,9 +52,7 @@ impl SdtHeader {
+ }
+ }
+ pub fn length(&self) -> usize {
+- self.length
+- .try_into()
+- .expect("expected usize to be at least 32 bits")
++ self.length as usize
+ }
+ }
+
+@@ -132,6 +130,9 @@ impl Drop for PhysmapGuard {
+ pub struct Sdt(Arc<[u8]>);
+
+ impl Sdt {
++ // SDT validation is split between parser and caller policy:
++ // - this parser only decides whether a given byte slice is structurally valid,
++ // - callers decide whether rejection is fatal (root [R|X]SDT) or degradable (child tables).
+ pub fn new(slice: Arc<[u8]>) -> Result<Self, InvalidSdtError> {
+ let header = match plain::from_bytes::<SdtHeader>(&slice) {
+ Ok(header) => header,
+@@ -233,6 +234,177 @@ impl fmt::Debug for Sdt {
+ pub struct Dsdt(Sdt);
+ pub struct Ssdt(Sdt);
+
++#[derive(Clone, Copy, Debug)]
++pub enum AmlBootstrapMethod {
++ HwdEnv,
++ X86BiosFallback,
++}
++impl AmlBootstrapMethod {
++ fn as_str(self) -> &'static str {
++ match self {
++ Self::HwdEnv => "hwd RSDP_ADDR/RSDP_SIZE handoff",
++ Self::X86BiosFallback => "x86 BIOS fallback",
++ }
++ }
++}
++
++#[derive(Clone, Debug)]
++pub struct AmlBootstrap {
++ rsdp_addr: usize,
++ rsdp_size: Option<usize>,
++ method: AmlBootstrapMethod,
++}
++impl AmlBootstrap {
++ pub fn from_env() -> Result<Self, Box<dyn std::error::Error>> {
++ let rsdp_addr = usize::from_str_radix(&std::env::var("RSDP_ADDR")?, 16)?;
++ let rsdp_size = match std::env::var("RSDP_SIZE") {
++ Ok(size) => Some(usize::from_str_radix(&size, 16)?),
++ Err(std::env::VarError::NotPresent) => None,
++ Err(err) => return Err(Box::new(err)),
++ };
++
++ Ok(Self {
++ rsdp_addr,
++ rsdp_size,
++ method: AmlBootstrapMethod::HwdEnv,
++ })
++ }
++
++ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++ pub fn x86_bios_fallback() -> Result<Option<Self>, Box<dyn std::error::Error>> {
++ if let Some(rsdp_addr) = search_x86_bios_rsdp()?
{
++ return Ok(Some(Self {
++ rsdp_addr,
++ rsdp_size: None,
++ method: AmlBootstrapMethod::X86BiosFallback,
++ }));
++ }
++
++ Ok(None)
++ }
++
++ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
++ pub fn x86_bios_fallback() -> Result<Option<Self>, Box<dyn std::error::Error>> {
++ Ok(None)
++ }
++
++ pub fn log_bootstrap(&self) {
++ log::info!(
++ "acpid: AML bootstrap via {} (RSDP at {:#X})",
++ self.method.as_str(),
++ self.rsdp_addr
++ );
++
++ if let Some(rsdp_size) = self.rsdp_size {
++ log::debug!("acpid: AML bootstrap RSDP_SIZE={:#X}", rsdp_size);
++ }
++ }
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++const RSDP_SIGNATURE: &[u8; 8] = b"RSD PTR ";
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn search_x86_bios_rsdp() -> Result<Option<usize>, Box<dyn std::error::Error>> {
++ let ebda_segment = read_u16_physical(0x40E)?;
++ let ebda_addr = usize::from(ebda_segment) << 4;
++
++ if ebda_addr != 0 {
++ if let Some(rsdp_addr) = search_rsdp_region(ebda_addr, 1024)? {
++ return Ok(Some(rsdp_addr));
++ }
++ }
++
++ search_rsdp_region(0xE0000, 0x20000).map_err(Into::into)
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn read_u16_physical(physaddr: usize) -> std::io::Result<u16> {
++ let start_page = physaddr / PAGE_SIZE * PAGE_SIZE;
++ let page_offset = physaddr % PAGE_SIZE;
++ let map = PhysmapGuard::map(start_page, 1)?;
++ let bytes = map
++ .get(page_offset..page_offset + mem::size_of::<u16>())
++ .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::UnexpectedEof, "short BIOS map"))?;
++
++ Ok(u16::from_le_bytes([bytes[0], bytes[1]]))
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn search_rsdp_region(physaddr: usize, length: usize) -> std::io::Result<Option<usize>> {
++ let start_page = physaddr / PAGE_SIZE * PAGE_SIZE;
++ let page_offset = physaddr % PAGE_SIZE;
++ let mapped_len = page_offset + length;
++ let page_count = mapped_len.div_ceil(PAGE_SIZE);
++ let map = PhysmapGuard::map(start_page, page_count)?;
++ let region = map.get(page_offset..page_offset + length).ok_or_else(|| {
++ std::io::Error::new(std::io::ErrorKind::UnexpectedEof, "short BIOS RSDP search window")
++ })?;
++
++ for candidate_offset in (0..=length.saturating_sub(20)).step_by(16) {
++ if region
++ .get(candidate_offset..candidate_offset + RSDP_SIGNATURE.len())
++ != Some(&RSDP_SIGNATURE[..])
++ {
++ continue;
++ }
++
++ if rsdp_candidate_valid(&region[candidate_offset..]) {
++ return Ok(Some(physaddr + candidate_offset));
++ }
++ }
++
++ Ok(None)
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn rsdp_candidate_valid(candidate: &[u8]) -> bool {
++ if candidate.len() < 20 || &candidate[..RSDP_SIGNATURE.len()] != RSDP_SIGNATURE {
++ return false;
++ }
++
++ if checksum_is_zero(&candidate[..20]).is_err() {
++ return false;
++ }
++
++ let revision = candidate[15];
++ if revision < 2 {
++ return true;
++ }
++
++ if candidate.len() < 36 {
++ return false;
++ }
++
++ let declared_length = u32::from_le_bytes([candidate[20], candidate[21], candidate[22], candidate[23]])
++ as usize;
++ if declared_length < 36 || candidate.len() < declared_length {
++ return false;
++ }
++
++ checksum_is_zero(&candidate[..declared_length]).is_ok()
++}
++
++#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
++fn checksum_is_zero(bytes: &[u8]) -> Result<(), ()> {
++ let checksum = bytes
++ .iter()
++ .copied()
++ .fold(0_u8, |current_sum, item| current_sum.wrapping_add(item));
++
++ if checksum == 0 {
++ Ok(())
++ } else {
++ Err(())
++ }
++}
++
++#[derive(Clone, Copy, Debug)]
++struct SleepTypeData {
++ slp_typa:
u16,
++ slp_typb: u16,
++}
++
+ // Current AML implementation builds the aml_context.namespace at startup,
+ // but the cache for symbols is lazy-loaded when someone
+ // reads from the acpi:/symbols scheme.
+@@ -245,15 +417,20 @@ pub struct AmlSymbols {
+ symbol_cache: FxHashMap<String, String>,
+ page_cache: Arc<Mutex<AmlPageCache>>,
+ aml_region_handlers: Vec<(RegionSpace, Box<dyn RegionHandler>)>,
++ aml_bootstrap: Option<AmlBootstrap>,
+ }
+
+ impl AmlSymbols {
+- pub fn new(aml_region_handlers: Vec<(RegionSpace, Box<dyn RegionHandler>)>) -> Self {
++ pub fn new(
++ aml_bootstrap: Option<AmlBootstrap>,
++ aml_region_handlers: Vec<(RegionSpace, Box<dyn RegionHandler>)>,
++ ) -> Self {
+ Self {
+ aml_context: None,
+ symbol_cache: FxHashMap::default(),
+ page_cache: Arc::new(Mutex::new(AmlPageCache::default())),
+ aml_region_handlers,
++ aml_bootstrap,
+ }
+ }
+
+@@ -264,9 +441,12 @@ impl AmlSymbols {
+ let format_err = |err| format!("{:?}", err);
+ let handler = AmlPhysMemHandler::new(pci_fd, Arc::clone(&self.page_cache));
+ //TODO: use these parsed tables for the rest of acpid
+- let rsdp_address = usize::from_str_radix(&std::env::var("RSDP_ADDR")?, 16)?;
++ let bootstrap = self
++ .aml_bootstrap
++ .as_ref()
++ .ok_or_else(|| std::io::Error::other("AML bootstrap unavailable"))?;
+ let tables =
+- unsafe { AcpiTables::from_rsdp(handler.clone(), rsdp_address).map_err(format_err)? };
++ unsafe { AcpiTables::from_rsdp(handler.clone(), bootstrap.rsdp_addr).map_err(format_err)? };
+ let platform = AcpiPlatform::new(tables, handler).map_err(format_err)?;
+ let interpreter = Interpreter::new_from_platform(&platform).map_err(format_err)?;
+ for (region, handler) in self.aml_region_handlers.drain(..) {
+@@ -316,7 +496,7 @@ impl AmlSymbols {
+ .namespace
+ .lock()
+ .traverse(|level_aml_name, level| {
+- for (child_seg, handle) in level.values.iter() {
++ for (child_seg, _handle) in level.values.iter() {
+ if let Ok(aml_name) =
+ AmlName::from_name_seg(child_seg.to_owned()).resolve(level_aml_name)
+ {
+@@ -379,6 +559,7 @@ pub struct AcpiContext {
+ tables: Vec<Sdt>,
+ dsdt: Option<Dsdt>,
+ fadt: Option<Fadt>,
++ shutdown_s5: RwLock<Option<SleepTypeData>>,
+
+ aml_symbols: RwLock<AmlSymbols>,
+
+@@ -426,27 +607,56 @@ impl AcpiContext {
+
+ pub fn init(
+ rxsdt_physaddrs: impl Iterator<Item = u64>,
++ aml_bootstrap: Option<AmlBootstrap>,
+ ec: Vec<(RegionSpace, Box<dyn RegionHandler>)>,
+ ) -> Self {
+- let tables = rxsdt_physaddrs
+- .map(|physaddr| {
+- let physaddr: usize = physaddr
+- .try_into()
+- .expect("expected ACPI addresses to be compatible with the current word size");
+-
+- log::trace!("TABLE AT {:#>08X}", physaddr);
+-
+- Sdt::load_from_physical(physaddr).expect("failed to load physical SDT")
+- })
+- .collect::<Vec<Sdt>>();
++ // Child-table validation policy:
++ // - checksum/length failures are degradable: warn, skip the table, continue boot,
++ // - malformed FADT is handled separately as "raw-table-only" mode for ACPI control paths,
++ // - MADT subtable interpretation is delegated to consumers, which must skip unknown entry
++ // types instead of treating them as daemon-fatal.
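++ // e.g. a vendor SSDT whose checksum no longer verifies is warned about with its
++ // physical address and then skipped, and boot continues with the remaining tables.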
++ let mut tables = Vec::new(); ++ for physaddr in rxsdt_physaddrs { ++ let physaddr: usize = match physaddr.try_into() { ++ Ok(physaddr) => physaddr, ++ Err(_) => { ++ log::warn!( ++ "acpid: skipping ACPI table at {:#X}: physical address out of range", ++ physaddr ++ ); ++ continue; ++ } ++ }; ++ ++ match Sdt::load_from_physical(physaddr) { ++ Ok(table) => { ++ log::debug!( ++ "acpid: accepted ACPI table {} at {:#X}", ++ String::from_utf8_lossy(&table.signature), ++ physaddr ++ ); ++ tables.push(table); ++ } ++ Err(TablePhysLoadError::Validity(InvalidSdtError::BadChecksum)) => { ++ log::warn!( ++ "acpid: skipping ACPI table at {:#X}: checksum validation failed", ++ physaddr ++ ); ++ } ++ Err(err) => { ++ log::warn!("acpid: skipping ACPI table at {:#X}: {}", physaddr, err); ++ } ++ } ++ } + + let mut this = Self { + tables, + dsdt: None, + fadt: None, ++ shutdown_s5: RwLock::new(None), + + // Temporary values +- aml_symbols: RwLock::new(AmlSymbols::new(ec)), ++ aml_symbols: RwLock::new(AmlSymbols::new(aml_bootstrap, ec)), + + next_ctx: RwLock::new(0), + +@@ -581,55 +791,26 @@ impl AcpiContext { + let port = fadt.pm1a_control_block as u16; + let mut val = 1 << 13; + +- let aml_symbols = self.aml_symbols.read(); +- +- let s5_aml_name = match acpi::aml::namespace::AmlName::from_str("\\_S5") { +- Ok(aml_name) => aml_name, +- Err(error) => { +- log::error!("Could not build AmlName for \\_S5, {:?}", error); +- return; +- } +- }; +- +- let s5 = match &aml_symbols.aml_context { +- Some(aml_context) => match aml_context.namespace.lock().get(s5_aml_name) { +- Ok(s5) => s5, +- Err(error) => { +- log::error!("Cannot set S-state, missing \\_S5, {:?}", error); +- return; ++ if self.shutdown_s5.read().is_none() { ++ match self.cache_shutdown_s5_from_ready_aml("existing AML context") { ++ Ok(true) | Ok(false) => {} ++ Err(err) => { ++ log::warn!("acpid: _S5 was not ready at shutdown: {}", err); + } +- }, +- None => { +- log::error!("Cannot set S-state, AML context not initialized"); +- return; + } +- }; +- +- let package = match s5.deref() { +- acpi::aml::object::Object::Package(package) => package, +- _ => { +- log::error!("Cannot set S-state, \\_S5 is not a package"); +- return; +- } +- }; ++ } + +- let slp_typa = match package[0].deref() { +- acpi::aml::object::Object::Integer(i) => i.to_owned(), +- _ => { +- log::error!("typa is not an Integer"); +- return; +- } +- }; +- let slp_typb = match package[1].deref() { +- acpi::aml::object::Object::Integer(i) => i.to_owned(), +- _ => { +- log::error!("typb is not an Integer"); +- return; +- } ++ let Some(sleep_types) = *self.shutdown_s5.read() else { ++ log::error!("Cannot set S-state, missing derived \\_S5 sleep types"); ++ return; + }; + +- log::trace!("Shutdown SLP_TYPa {:X}, SLP_TYPb {:X}", slp_typa, slp_typb); +- val |= slp_typa as u16; ++ log::trace!( ++ "Shutdown SLP_TYPa {:X}, SLP_TYPb {:X}", ++ sleep_types.slp_typa, ++ sleep_types.slp_typb ++ ); ++ val |= sleep_types.slp_typa; + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { +@@ -652,6 +833,86 @@ impl AcpiContext { + core::hint::spin_loop(); + } + } ++ ++ pub fn prime_shutdown_s5(&self, pci_fd: Option<&libredox::Fd>, source: &'static str) { ++ match self.cache_shutdown_s5(pci_fd, source) { ++ Ok(()) => {} ++ Err(err) => { ++ log::warn!("acpid: unable to derive _S5 from {}: {}", source, err); ++ } ++ } ++ } ++ ++ fn cache_shutdown_s5( ++ &self, ++ pci_fd: Option<&libredox::Fd>, ++ source: &'static str, ++ ) -> Result<(), String> { ++ if self.shutdown_s5.read().is_some() { ++ return 
Ok(());
++ }
++
++ let mut aml_symbols = self.aml_symbols.write();
++ let aml_context = aml_symbols
++ .aml_context_mut(pci_fd)
++ .map_err(|err| format!("AML not ready: {err}"))?;
++ let sleep_types = extract_s5_sleep_types(aml_context)?;
++
++ *self.shutdown_s5.write() = Some(sleep_types);
++ log::info!("acpid: _S5 derived from {}", source);
++ Ok(())
++ }
++
++ fn cache_shutdown_s5_from_ready_aml(&self, source: &'static str) -> Result<bool, String> {
++ if self.shutdown_s5.read().is_some() {
++ return Ok(true);
++ }
++
++ let aml_symbols = self.aml_symbols.read();
++ let Some(aml_context) = aml_symbols.aml_context.as_ref() else {
++ return Ok(false);
++ };
++
++ let sleep_types = extract_s5_sleep_types(aml_context)?;
++ drop(aml_symbols);
++
++ *self.shutdown_s5.write() = Some(sleep_types);
++ log::info!("acpid: _S5 derived from {}", source);
++ Ok(true)
++ }
++}
++
++fn extract_s5_sleep_types(
++ aml_context: &Interpreter,
++) -> Result<SleepTypeData, String> {
++ let s5_aml_name = acpi::aml::namespace::AmlName::from_str("\\_S5")
++ .map_err(|error| format!("failed to build \\_S5 name: {error:?}"))?;
++ let s5 = aml_context
++ .namespace
++ .lock()
++ .get(s5_aml_name)
++ .map_err(|error| format!("missing \\_S5: {error:?}"))?;
++ let package = match s5.deref() {
++ acpi::aml::object::Object::Package(package) => package,
++ _ => return Err("\\_S5 is not a package".into()),
++ };
++
++ let slp_typa = extract_sleep_type(package.get(0), "SLP_TYPa")?;
++ let slp_typb = extract_sleep_type(package.get(1), "SLP_TYPb")?;
++
++ Ok(SleepTypeData { slp_typa, slp_typb })
++}
++
++fn extract_sleep_type(value: Option<&WrappedObject>, label: &'static str) -> Result<u16, String> {
++ let Some(value) = value else {
++ return Err(format!("missing {label} in \\_S5 package"));
++ };
++
++ match value.deref() {
++ acpi::aml::object::Object::Integer(i) => u16::try_from(*i)
++ .map_err(|_| format!("{label} out of range for PM1 control register")),
++ _ => Err(format!("{label} is not an Integer")),
++ }
+ }
+
+ #[repr(C, packed)]
+@@ -760,45 +1021,66 @@ impl Deref for Fadt {
+ type Target = FadtStruct;
+
+ fn deref(&self) -> &Self::Target {
+- plain::from_bytes::<FadtStruct>(&self.0 .0)
+- .expect("expected FADT struct to already be validated in Deref impl")
++ match plain::from_bytes::<FadtStruct>(&self.0 .0) {
++ Ok(fadt) => fadt,
++ Err(plain::Error::TooShort) => unreachable!(
++ "Fadt::new validates the minimum FADT size before constructing Fadt"
++ ),
++ Err(plain::Error::BadAlignment) => unreachable!(
++ "plain::from_bytes reported bad alignment, but FadtStruct is #[repr(packed)]"
++ ),
++ }
+ }
+ }
+
+ impl Fadt {
+ pub fn new(sdt: Sdt) -> Option<Self> {
+- if sdt.signature != *b"FACP" || sdt.length() < mem::size_of::() {
++ if sdt.signature != *b"FACP" || sdt.length() < mem::size_of::() {
+ return None;
+ }
+ Some(Fadt(sdt))
+ }
+
+ pub fn init(context: &mut AcpiContext) {
+- let fadt_sdt = context
+- .take_single_sdt(*b"FACP")
+- .expect("expected ACPI to always have a FADT");
++ // FADT policy: this table is mandatory for ACPI control services such as shutdown/reboot.
++ // If it is missing or malformed, acpid stays alive for diagnostics/raw tables but degrades
++ // into raw-table-only mode instead of crashing the boot.
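++ // Raw-table-only mode keeps acpi:/tables readable for diagnostics; only the
++ // FADT-dependent control paths (e.g. the _S5 shutdown write) are disabled.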
++ let Some(fadt_sdt) = context.take_single_sdt(*b"FACP") else { ++ log::error!("acpid: missing FADT; booting without ACPI control services"); ++ return; ++ }; + + let fadt = match Fadt::new(fadt_sdt) { + Some(fadt) => fadt, + None => { +- log::error!("Failed to find FADT"); ++ log::error!("acpid: corrupt FADT; booting without ACPI control services"); + return; + } + }; + + let dsdt_ptr = match fadt.acpi_2_struct() { +- Some(fadt2) => usize::try_from(fadt2.x_dsdt).unwrap_or_else(|_| { +- usize::try_from(fadt.dsdt).expect("expected any given u32 to fit within usize") +- }), +- None => usize::try_from(fadt.dsdt).expect("expected any given u32 to fit within usize"), ++ Some(fadt2) if fadt2.x_dsdt != 0 => match usize::try_from(fadt2.x_dsdt) { ++ Ok(dsdt_ptr) => dsdt_ptr, ++ Err(_) => { ++ log::warn!( ++ "acpid: x_dsdt address out of range; falling back to 32-bit DSDT pointer" ++ ); ++ fadt.dsdt as usize ++ } ++ }, ++ _ => fadt.dsdt as usize, + }; + + log::debug!("FACP at {:X}", { dsdt_ptr }); + +- let dsdt_sdt = match Sdt::load_from_physical(fadt.dsdt as usize) { ++ let dsdt_sdt = match Sdt::load_from_physical(dsdt_ptr) { + Ok(dsdt) => dsdt, + Err(error) => { +- log::error!("Failed to load DSDT: {}", error); ++ log::error!( ++ "acpid: corrupt FADT/DSDT linkage (DSDT at {:#X}): booting without ACPI control services: {}", ++ dsdt_ptr, ++ error ++ ); + return; + } + }; +diff --git a/drivers/acpid/src/main.rs b/drivers/acpid/src/main.rs +index 059254b3..25566553 100644 +--- a/drivers/acpid/src/main.rs ++++ b/drivers/acpid/src/main.rs +@@ -3,6 +3,7 @@ use std::fs::File; + use std::mem; + use std::ops::ControlFlow; + use std::os::unix::io::AsRawFd; ++use std::process; + use std::sync::Arc; + + use ::acpi::aml::op_region::{RegionHandler, RegionSpace}; +@@ -28,94 +29,206 @@ fn daemon(daemon: daemon::Daemon) -> ! { + + log::info!("acpid start"); + +- let rxsdt_raw_data: Arc<[u8]> = std::fs::read("/scheme/kernel.acpi/rxsdt") +- .expect("acpid: failed to read `/scheme/kernel.acpi/rxsdt`") +- .into(); ++ let rxsdt_raw_data: Arc<[u8]> = match std::fs::read("/scheme/kernel.acpi/rxsdt") { ++ Ok(data) => data.into(), ++ Err(err) => { ++ log::error!("acpid: failed to read `/scheme/kernel.acpi/rxsdt`: {}", err); ++ process::exit(1); ++ } ++ }; + + if rxsdt_raw_data.is_empty() { + log::info!("System doesn't use ACPI"); + daemon.ready(); +- std::process::exit(0); ++ process::exit(0); + } + +- let sdt = self::acpi::Sdt::new(rxsdt_raw_data).expect("acpid: failed to parse [RX]SDT"); ++ // Root-table policy: if the kernel-provided [R|X]SDT is malformed, acpid cannot enumerate any ++ // firmware tables at all. That is fatal to this daemon, but it must fail with a logged exit ++ // rather than a panic on malformed firmware input. ++ let sdt = match self::acpi::Sdt::new(rxsdt_raw_data) { ++ Ok(sdt) => sdt, ++ Err(err) => { ++ log::error!("acpid: failed to parse kernel [R|X]SDT: {}", err); ++ process::exit(1); ++ } ++ }; ++ ++ // AML bootstrap contract: ++ // - preferred path: RSDP_ADDR[/RSDP_SIZE] inherited into acpid by the boot path, ++ // - x86 fallback: bounded BIOS RSDP search when that explicit handoff is absent or unusable. 
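++ // Both variables are hexadecimal strings without a 0x prefix (parsed with
++ // from_str_radix, base 16); RSDP_SIZE is optional and is only logged for debugging.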
++ let aml_bootstrap = match self::acpi::AmlBootstrap::from_env() {
++ Ok(bootstrap) => {
++ bootstrap.log_bootstrap();
++ Some(bootstrap)
++ }
++ Err(err) => {
++ log::warn!(
++ "acpid: explicit AML bootstrap handoff unavailable ({}); trying x86 BIOS fallback",
++ err
++ );
+
+- let mut thirty_two_bit;
+- let mut sixty_four_bit;
+
++ match self::acpi::AmlBootstrap::x86_bios_fallback() {
++ Ok(Some(bootstrap)) => {
++ bootstrap.log_bootstrap();
++ Some(bootstrap)
++ }
++ Ok(None) => {
++ log::warn!(
++ "acpid: AML bootstrap unavailable; continuing without AML-backed ACPI services"
++ );
++ None
++ }
++ Err(err) => {
++ log::warn!(
++ "acpid: x86 BIOS AML bootstrap fallback failed ({}); continuing without AML-backed ACPI services",
++ err
++ );
++ None
++ }
++ }
++ }
++ };
+
+- let physaddrs_iter = match &sdt.signature {
++ let physaddrs = match &sdt.signature {
+ b"RSDT" => {
+- thirty_two_bit = sdt
+- .data()
+- .chunks(mem::size_of::<u32>())
+- // TODO: With const generics, the compiler has some way of doing this for static sizes.
+- .map(|chunk| <[u8; mem::size_of::<u32>()]>::try_from(chunk).unwrap())
+- .map(|chunk| u32::from_le_bytes(chunk))
+- .map(u64::from);
+-
+- &mut thirty_two_bit as &mut dyn Iterator<Item = u64>
++ let chunks = sdt.data().chunks_exact(mem::size_of::<u32>());
++ if !chunks.remainder().is_empty() {
++ log::error!("acpid: malformed RSDT payload length {}", sdt.data().len());
++ process::exit(1);
++ }
++
++ chunks
++ .map(|chunk| {
++ let chunk = <[u8; mem::size_of::<u32>()]>::try_from(chunk)
++ .map_err(|_| "invalid 32-bit RSDT entry width")?;
++ Ok(u64::from(u32::from_le_bytes(chunk)))
++ })
++ .collect::<Result<Vec<u64>, &str>>()
+ }
+ b"XSDT" => {
+- sixty_four_bit = sdt
+- .data()
+- .chunks(mem::size_of::<u64>())
+- .map(|chunk| <[u8; mem::size_of::<u64>()]>::try_from(chunk).unwrap())
+- .map(|chunk| u64::from_le_bytes(chunk));
++ let chunks = sdt.data().chunks_exact(mem::size_of::<u64>());
++ if !chunks.remainder().is_empty() {
++ log::error!("acpid: malformed XSDT payload length {}", sdt.data().len());
++ process::exit(1);
++ }
+
+- &mut sixty_four_bit as &mut dyn Iterator<Item = u64>
++ chunks
++ .map(|chunk| {
++ let chunk = <[u8; mem::size_of::<u64>()]>::try_from(chunk)
++ .map_err(|_| "invalid 64-bit XSDT entry width")?;
++ Ok(u64::from_le_bytes(chunk))
++ })
++ .collect::<Result<Vec<u64>, &str>>()
++ }
++ _ => {
++ log::error!(
++ "acpid: expected kernel root table to be RSDT or XSDT, got {}",
++ String::from_utf8_lossy(&sdt.signature)
++ );
++ process::exit(1);
++ }
++ };
++ let physaddrs = match physaddrs {
++ Ok(physaddrs) => physaddrs,
++ Err(err) => {
++ log::error!("acpid: failed to decode root table pointers: {}", err);
++ process::exit(1);
+ }
+- _ => panic!("acpid: expected [RX]SDT from kernel to be either of those"),
+ };
+
+ let region_handlers: Vec<(RegionSpace, Box<dyn RegionHandler>)> = vec![
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ (RegionSpace::EmbeddedControl, Box::new(ec::Ec::new())),
+ ];
+- let acpi_context = self::acpi::AcpiContext::init(physaddrs_iter, region_handlers);
++ let acpi_context = self::acpi::AcpiContext::init(physaddrs.into_iter(), aml_bootstrap, region_handlers);
+
+ // TODO: I/O permission bitmap?
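++ // Port I/O rights are required on x86 because the ACPI shutdown path writes
++ // SLP_TYPx | SLP_EN (bit 13) directly to the PM1a control port.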
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +- common::acquire_port_io_rights().expect("acpid: failed to set I/O privilege level to Ring 3"); ++ if let Err(err) = common::acquire_port_io_rights() { ++ log::error!( ++ "acpid: failed to set I/O privilege level to Ring 3: {}", ++ err ++ ); ++ process::exit(1); ++ } + +- let shutdown_pipe = File::open("/scheme/kernel.acpi/kstop") +- .expect("acpid: failed to open `/scheme/kernel.acpi/kstop`"); ++ let shutdown_pipe = match File::open("/scheme/kernel.acpi/kstop") { ++ Ok(file) => file, ++ Err(err) => { ++ log::error!("acpid: failed to open `/scheme/kernel.acpi/kstop`: {}", err); ++ process::exit(1); ++ } ++ }; + +- let mut event_queue = RawEventQueue::new().expect("acpid: failed to create event queue"); +- let socket = Socket::nonblock().expect("acpid: failed to create disk scheme"); ++ let mut event_queue = match RawEventQueue::new() { ++ Ok(event_queue) => event_queue, ++ Err(err) => { ++ log::error!("acpid: failed to create event queue: {}", err); ++ process::exit(1); ++ } ++ }; ++ let socket = match Socket::nonblock() { ++ Ok(socket) => socket, ++ Err(err) => { ++ log::error!("acpid: failed to create acpi scheme socket: {}", err); ++ process::exit(1); ++ } ++ }; + + let mut scheme = self::scheme::AcpiScheme::new(&acpi_context, &socket); + let mut handler = Blocking::new(&socket, 16); + +- event_queue +- .subscribe(shutdown_pipe.as_raw_fd() as usize, 0, EventFlags::READ) +- .expect("acpid: failed to register shutdown pipe for event queue"); +- event_queue +- .subscribe(socket.inner().raw(), 1, EventFlags::READ) +- .expect("acpid: failed to register scheme socket for event queue"); ++ if let Err(err) = event_queue.subscribe(shutdown_pipe.as_raw_fd() as usize, 0, EventFlags::READ) ++ { ++ log::error!( ++ "acpid: failed to register shutdown pipe for event queue: {}", ++ err ++ ); ++ process::exit(1); ++ } ++ if let Err(err) = event_queue.subscribe(socket.inner().raw(), 1, EventFlags::READ) { ++ log::error!( ++ "acpid: failed to register scheme socket for event queue: {}", ++ err ++ ); ++ process::exit(1); ++ } + +- register_sync_scheme(&socket, "acpi", &mut scheme) +- .expect("acpid: failed to register acpi scheme to namespace"); ++ if let Err(err) = register_sync_scheme(&socket, "acpi", &mut scheme) { ++ log::error!("acpid: failed to register acpi scheme to namespace: {}", err); ++ process::exit(1); ++ } + + daemon.ready(); + +- libredox::call::setrens(0, 0).expect("acpid: failed to enter null namespace"); ++ if let Err(err) = libredox::call::setrens(0, 0) { ++ log::error!("acpid: failed to enter null namespace: {}", err); ++ process::exit(1); ++ } + + let mut mounted = true; + while mounted { +- let Some(event) = event_queue +- .next() +- .transpose() +- .expect("acpid: failed to read event file") +- else { ++ let event = match event_queue.next().transpose() { ++ Ok(event) => event, ++ Err(err) => { ++ log::error!("acpid: failed to read event file: {}", err); ++ process::exit(1); ++ } ++ }; ++ let Some(event) = event else { + break; + }; + + if event.fd == socket.inner().raw() { + loop { +- match handler +- .process_requests_nonblocking(&mut scheme) +- .expect("acpid: failed to process requests") +- { ++ match match handler.process_requests_nonblocking(&mut scheme) { ++ Ok(flow) => flow, ++ Err(err) => { ++ log::error!("acpid: failed to process requests: {}", err); ++ process::exit(1); ++ } ++ } { + ControlFlow::Continue(()) => {} + ControlFlow::Break(()) => break, + } +diff --git a/drivers/acpid/src/scheme.rs 
b/drivers/acpid/src/scheme.rs +index 5a5040c3..6e57624a 100644 +--- a/drivers/acpid/src/scheme.rs ++++ b/drivers/acpid/src/scheme.rs +@@ -474,6 +474,8 @@ impl SchemeSync for AcpiScheme<'_, '_> { + return Err(Error::new(EINVAL)); + } else { + self.pci_fd = Some(new_fd); ++ self.ctx ++ .prime_shutdown_s5(self.pci_fd.as_ref(), "PCI-backed AML handoff"); + } + + Ok(num_fds) diff --git a/local/patches/base/P3-pcid-aer-scheme.patch b/local/patches/base/P3-pcid-aer-scheme.patch new file mode 100644 index 00000000..516e15fa --- /dev/null +++ b/local/patches/base/P3-pcid-aer-scheme.patch @@ -0,0 +1,398 @@ +--- a/drivers/pcid/src/cfg_access/mod.rs ++++ b/drivers/pcid/src/cfg_access/mod.rs +@@ -349,6 +349,10 @@ + let bus_addr = self.bus_addr(address.segment(), address.bus())?; + Some(unsafe { bus_addr.add(Self::bus_addr_offset_in_dwords(address, offset)) }) + } ++ ++ pub fn has_extended_config(&self, address: PciAddress) -> bool { ++ self.mmio_addr(address, 0x100).is_some() ++ } + } + + impl ConfigRegionAccess for Pcie { +--- a/drivers/pcid/src/scheme.rs ++++ b/drivers/pcid/src/scheme.rs +@@ -5,12 +5,61 @@ + use redox_scheme::{CallerCtx, OpenResult}; + use scheme_utils::HandleMap; + use syscall::dirent::{DirEntry, DirentBuf, DirentKind}; +-use syscall::error::{Error, Result, EACCES, EBADF, EINVAL, EIO, EISDIR, ENOENT, ENOTDIR, EALREADY}; ++use syscall::error::{ ++ Error, Result, EACCES, EALREADY, EBADF, EINVAL, EIO, EISDIR, ENOENT, ENOTDIR, EROFS, ++}; + use syscall::flag::{MODE_CHR, MODE_DIR, O_DIRECTORY, O_STAT}; + use syscall::schemev2::NewFdFlags; + use syscall::ENOLCK; + + use crate::cfg_access::Pcie; ++ ++const PCIE_EXTENDED_CAPABILITY_AER: u16 = 0x0001; ++ ++#[derive(Clone, Copy)] ++enum AerRegisterName { ++ UncorStatus, ++ UncorMask, ++ UncorSeverity, ++ CorStatus, ++ CorMask, ++ Cap, ++ HeaderLog, ++} ++ ++impl AerRegisterName { ++ fn from_path(path: &str) -> Option { ++ Some(match path { ++ "uncor_status" => Self::UncorStatus, ++ "uncor_mask" => Self::UncorMask, ++ "uncor_severity" => Self::UncorSeverity, ++ "cor_status" => Self::CorStatus, ++ "cor_mask" => Self::CorMask, ++ "cap" => Self::Cap, ++ "header_log" => Self::HeaderLog, ++ _ => return None, ++ }) ++ } ++ ++ const fn offset(self) -> u16 { ++ match self { ++ Self::UncorStatus => 0x00, ++ Self::UncorMask => 0x04, ++ Self::UncorSeverity => 0x08, ++ Self::CorStatus => 0x0C, ++ Self::CorMask => 0x10, ++ Self::Cap => 0x14, ++ Self::HeaderLog => 0x18, ++ } ++ } ++ ++ const fn len(self) -> usize { ++ match self { ++ Self::HeaderLog => 16, ++ _ => 4, ++ } ++ } ++} + + pub struct PciScheme { + handles: HandleMap, +@@ -20,13 +69,27 @@ + binds: HashMap, + } + enum Handle { +- TopLevel { entries: Vec }, ++ TopLevel { ++ entries: Vec, ++ }, + Access, +- Device, +- Channel { addr: PciAddress, st: ChannelState }, ++ Device { ++ addr: PciAddress, ++ }, ++ Channel { ++ addr: PciAddress, ++ st: ChannelState, ++ }, + SchemeRoot, + /// Represents an open handle to a device's bind endpoint +- Bind { addr: PciAddress }, ++ Bind { ++ addr: PciAddress, ++ }, ++ AerDir, ++ Aer { ++ addr: PciAddress, ++ register: AerRegisterName, ++ }, + /// Uevent surface for hotplug consumers. Opening uevent returns an object + /// from which device add/remove events can be read. Since pcid currently + /// only scans at startup, this surface is ready for hotplug polling consumers. +@@ -38,13 +101,23 @@ + } + impl Handle { + fn is_file(&self) -> bool { +- matches!(self, Self::Access | Self::Channel { .. } | Self::Bind { .. 
} | Self::Uevent) ++ matches!( ++ self, ++ Self::Access ++ | Self::Channel { .. } ++ | Self::Bind { .. } ++ | Self::Aer { .. } ++ | Self::Uevent ++ ) + } + fn is_dir(&self) -> bool { + !self.is_file() + } + fn requires_root(&self) -> bool { +- matches!(self, Self::Access | Self::Channel { .. } | Self::Bind { .. }) ++ matches!( ++ self, ++ Self::Access | Self::Channel { .. } | Self::Bind { .. } ++ ) + } + fn is_scheme_root(&self) -> bool { + matches!(self, Self::SchemeRoot) +@@ -57,6 +130,16 @@ + } + + const DEVICE_CONTENTS: &[&str] = &["channel", "bind"]; ++const DEVICE_AER_CONTENTS: &[&str] = &["channel", "bind", "aer"]; ++const AER_CONTENTS: &[&str] = &[ ++ "uncor_status", ++ "uncor_mask", ++ "uncor_severity", ++ "cor_status", ++ "cor_mask", ++ "cap", ++ "header_log", ++]; + + impl PciScheme { + pub fn access(&mut self) -> usize { +@@ -141,7 +224,12 @@ + + let (len, mode) = match handle.inner { + Handle::TopLevel { ref entries } => (entries.len(), MODE_DIR | 0o755), +- Handle::Device => (DEVICE_CONTENTS.len(), MODE_DIR | 0o755), ++ Handle::Device { addr } => ( ++ Self::device_entries(&self.pcie, addr).len(), ++ MODE_DIR | 0o755, ++ ), ++ Handle::AerDir => (AER_CONTENTS.len(), MODE_DIR | 0o755), ++ Handle::Aer { register, .. } => (register.len(), MODE_CHR | 0o444), + Handle::Access | Handle::Channel { .. } | Handle::Bind { .. } => (0, MODE_CHR | 0o600), + Handle::Uevent => (0, MODE_CHR | 0o644), + Handle::SchemeRoot => return Err(Error::new(EBADF)), +@@ -154,7 +242,7 @@ + &mut self, + id: usize, + buf: &mut [u8], +- _offset: u64, ++ offset: u64, + _fcntl_flags: u32, + _ctx: &CallerCtx, + ) -> Result { +@@ -166,11 +254,14 @@ + + match handle.inner { + Handle::TopLevel { .. } => Err(Error::new(EISDIR)), +- Handle::Device => Err(Error::new(EISDIR)), ++ Handle::Device { .. } | Handle::AerDir => Err(Error::new(EISDIR)), + Handle::Channel { + addr: _, + ref mut st, + } => Self::read_channel(st, buf), ++ Handle::Aer { addr, register } => { ++ Self::read_aer_register(&self.pcie, addr, register, buf, offset) ++ } + Handle::Uevent => { + // Uevent surface is ready for hotplug polling consumers. + // pcid currently only scans at startup, so return empty (EAGAIN would indicate no data available). +@@ -209,8 +300,15 @@ + } + return Ok(buf); + } +- Handle::Device => DEVICE_CONTENTS, +- Handle::Access | Handle::Channel { .. } | Handle::Bind { .. } | Handle::Uevent => return Err(Error::new(ENOTDIR)), ++ Handle::Device { addr } => Self::device_entries(&self.pcie, addr), ++ Handle::AerDir => AER_CONTENTS, ++ Handle::Access ++ | Handle::Channel { .. } ++ | Handle::Bind { .. } ++ | Handle::Aer { .. } ++ | Handle::Uevent => { ++ return Err(Error::new(ENOTDIR)); ++ } + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + +@@ -243,6 +341,7 @@ + Handle::Channel { addr, ref mut st } => { + Self::write_channel(&self.pcie, &mut self.tree, addr, st, buf) + } ++ Handle::Aer { .. 
} => Err(Error::new(EROFS)),
+
+ _ => Err(Error::new(EBADF)),
+ }
+ }
+@@ -357,45 +456,151 @@
+ binds: HashMap::new(),
+ }
+ }
+- fn parse_after_pci_addr(&mut self, addr: PciAddress, after: &str, ctx: &CallerCtx) -> Result<Handle> {
++ fn device_entries(pcie: &Pcie, addr: PciAddress) -> &'static [&'static str] {
++ if Self::find_pcie_extended_capability(pcie, addr, PCIE_EXTENDED_CAPABILITY_AER).is_some() {
++ DEVICE_AER_CONTENTS
++ } else {
++ DEVICE_CONTENTS
++ }
++ }
++ fn find_pcie_extended_capability(
++ pcie: &Pcie,
++ addr: PciAddress,
++ capability_id: u16,
++ ) -> Option<u16> {
++ if !pcie.has_extended_config(addr) {
++ return None;
++ }
++
++ let mut offset = 0x100_u16;
++
++ while offset <= 0xFFC {
++ let header = unsafe { pcie.read(addr, offset) };
++ if header == 0 || header == u32::MAX {
++ return None;
++ }
++
++ if (header & 0xFFFF) as u16 == capability_id {
++ return Some(offset);
++ }
++
++ let next = ((header >> 20) & 0xFFF) as u16;
++ if next < 0x100 || next <= offset || next > 0xFFC || next % 4 != 0 {
++ return None;
++ }
++ offset = next;
++ }
++
++ None
++ }
++ fn read_file_bytes(data: &[u8], buf: &mut [u8], offset: u64) -> Result<usize> {
++ let Ok(offset) = usize::try_from(offset) else {
++ return Ok(0);
++ };
++ if offset >= data.len() {
++ return Ok(0);
++ }
++
++ let count = std::cmp::min(buf.len(), data.len() - offset);
++ buf[..count].copy_from_slice(&data[offset..offset + count]);
++ Ok(count)
++ }
++ fn read_aer_register(
++ pcie: &Pcie,
++ addr: PciAddress,
++ register: AerRegisterName,
++ buf: &mut [u8],
++ offset: u64,
++ ) -> Result<usize> {
++ let Some(aer_base) =
++ Self::find_pcie_extended_capability(pcie, addr, PCIE_EXTENDED_CAPABILITY_AER)
++ else {
++ return Err(Error::new(ENOENT));
++ };
++
++ let mut data = [0_u8; 16];
++ for (index, chunk) in data[..register.len()].chunks_exact_mut(4).enumerate() {
++ let index = u16::try_from(index).map_err(|_| Error::new(EIO))?;
++ let value = unsafe { pcie.read(addr, aer_base + register.offset() + index * 4) };
++ chunk.copy_from_slice(&value.to_le_bytes());
++ }
++
++ Self::read_file_bytes(&data[..register.len()], buf, offset)
++ }
++ fn parse_after_pci_addr(
++ &mut self,
++ addr: PciAddress,
++ after: &str,
++ ctx: &CallerCtx,
++ ) -> Result<Handle> {
+ if after.chars().next().map_or(false, |c| c != '/') {
+ return Err(Error::new(ENOENT));
+ }
+ let func = self.tree.get_mut(&addr).ok_or(Error::new(ENOENT))?;
+
+ Ok(if after.is_empty() {
+- Handle::Device
++ Handle::Device { addr }
+ } else {
+ let path = &after[1..];
+
+- match path {
+- "channel" => {
+- if func.enabled {
+- return Err(Error::new(ENOLCK));
++ if path == "aer" {
++ if Self::find_pcie_extended_capability(
++ &self.pcie,
++ addr,
++ PCIE_EXTENDED_CAPABILITY_AER,
++ )
++ .is_none()
++ {
++ return Err(Error::new(ENOENT));
++ }
++ Handle::AerDir
++ } else if let Some(register_name) = path.strip_prefix("aer/") {
++ let register =
++ AerRegisterName::from_path(register_name).ok_or(Error::new(ENOENT))?;
++ if Self::find_pcie_extended_capability(
++ &self.pcie,
++ addr,
++ PCIE_EXTENDED_CAPABILITY_AER,
++ )
++ .is_none()
++ {
++ return Err(Error::new(ENOENT));
++ }
++ Handle::Aer { addr, register }
++ } else {
++ match path {
++ "channel" => {
++ if func.enabled {
++ return Err(Error::new(ENOLCK));
++ }
++ func.inner.legacy_interrupt_line = crate::enable_function(
++ &self.pcie,
++ &mut func.endpoint_header,
++ &mut func.capabilities,
++ );
++ func.enabled = true;
++ Handle::Channel {
++ addr,
++ st: ChannelState::AwaitingData,
++ }
+ }
+- func.inner.legacy_interrupt_line =
crate::enable_function( +- &self.pcie, +- &mut func.endpoint_header, +- &mut func.capabilities, +- ); +- func.enabled = true; +- Handle::Channel { +- addr, +- st: ChannelState::AwaitingData, ++ "bind" => { ++ let addr_str = format!("{}", addr); ++ if let Some(&owner_pid) = self.binds.get(&addr_str) { ++ log::info!( ++ "pcid: device {} already bound by pid {}", ++ addr_str, ++ owner_pid ++ ); ++ return Err(Error::new(EALREADY)); ++ } ++ let caller_pid = u32::try_from(ctx.pid).map_err(|_| Error::new(EINVAL))?; ++ self.binds.insert(addr_str.clone(), caller_pid); ++ log::info!("pcid: device {} bound by pid {}", addr_str, caller_pid); ++ Handle::Bind { addr } + } +- } +- "bind" => { +- let addr_str = format!("{}", addr); +- if let Some(&owner_pid) = self.binds.get(&addr_str) { +- log::info!("pcid: device {} already bound by pid {}", addr_str, owner_pid); +- return Err(Error::new(EALREADY)); +- } +- let caller_pid = ctx.pid; +- self.binds.insert(addr_str.clone(), caller_pid); +- log::info!("pcid: device {} bound by pid {}", addr_str, caller_pid); +- Handle::Bind { addr } +- } +- _ => return Err(Error::new(ENOENT)), ++ _ => return Err(Error::new(ENOENT)), ++ } + } + }) + } diff --git a/local/patches/base/P3-pcid-bind-scheme.patch b/local/patches/base/P3-pcid-bind-scheme.patch new file mode 100644 index 00000000..1cdf39f2 --- /dev/null +++ b/local/patches/base/P3-pcid-bind-scheme.patch @@ -0,0 +1,182 @@ +diff --git a/drivers/pcid/src/scheme.rs b/drivers/pcid/src/scheme.rs +index bb9f39a3..06be6267 100644 +--- a/drivers/pcid/src/scheme.rs ++++ b/drivers/pcid/src/scheme.rs +@@ -1,11 +1,11 @@ +-use std::collections::{BTreeMap, VecDeque}; ++use std::collections::{BTreeMap, HashMap, VecDeque}; + + use pci_types::{ConfigRegionAccess, PciAddress}; + use redox_scheme::scheme::SchemeSync; + use redox_scheme::{CallerCtx, OpenResult}; + use scheme_utils::HandleMap; + use syscall::dirent::{DirEntry, DirentBuf, DirentKind}; +-use syscall::error::{Error, Result, EACCES, EBADF, EINVAL, EIO, EISDIR, ENOENT, ENOTDIR}; ++use syscall::error::{Error, Result, EACCES, EBADF, EINVAL, EIO, EISDIR, ENOENT, ENOTDIR, EALREADY}; + use syscall::flag::{MODE_CHR, MODE_DIR, O_DIRECTORY, O_STAT}; + use syscall::schemev2::NewFdFlags; + use syscall::ENOLCK; +@@ -16,6 +16,8 @@ pub struct PciScheme { + handles: HandleMap, + pub pcie: Pcie, + pub tree: BTreeMap, ++ /// Maps device address string (e.g. "0000:00:14.0") to owning PID ++ binds: HashMap, + } + enum Handle { + TopLevel { entries: Vec }, +@@ -23,6 +25,12 @@ enum Handle { + Device, + Channel { addr: PciAddress, st: ChannelState }, + SchemeRoot, ++ /// Represents an open handle to a device's bind endpoint ++ Bind { addr: PciAddress }, ++ /// Uevent surface for hotplug consumers. Opening uevent returns an object ++ /// from which device add/remove events can be read. Since pcid currently ++ /// only scans at startup, this surface is ready for hotplug polling consumers. ++ Uevent, + } + struct HandleWrapper { + inner: Handle, +@@ -30,14 +38,13 @@ struct HandleWrapper { + } + impl Handle { + fn is_file(&self) -> bool { +- matches!(self, Self::Access | Self::Channel { .. }) ++ matches!(self, Self::Access | Self::Channel { .. } | Self::Bind { .. } | Self::Uevent) + } + fn is_dir(&self) -> bool { + !self.is_file() + } +- // TODO: capability rather than root + fn requires_root(&self) -> bool { +- matches!(self, Self::Access | Self::Channel { .. }) ++ matches!(self, Self::Access | Self::Channel { .. } | Self::Bind { .. 
}) + } + fn is_scheme_root(&self) -> bool { + matches!(self, Self::SchemeRoot) +@@ -49,7 +56,7 @@ enum ChannelState { + AwaitingResponseRead(VecDeque), + } + +-const DEVICE_CONTENTS: &[&str] = &["channel"]; ++const DEVICE_CONTENTS: &[&str] = &["channel", "bind"]; + + impl PciScheme { + pub fn access(&mut self) -> usize { +@@ -88,22 +95,25 @@ impl SchemeSync for PciScheme { + let path = path.trim_matches('/'); + + let handle = if path.is_empty() { +- Handle::TopLevel { +- entries: self +- .tree +- .iter() +- // FIXME remove replacement of : once the old scheme format is no longer supported. +- .map(|(addr, _)| format!("{}", addr).replace(':', "--")) +- .collect::>(), +- } ++ let mut entries: Vec = self ++ .tree ++ .iter() ++ // FIXME remove replacement of : once the old scheme format is no longer supported. ++ .map(|(addr, _)| format!("{}", addr).replace(':', "--")) ++ .collect(); ++ entries.push(String::from("uevent")); ++ entries.push(String::from("access")); ++ Handle::TopLevel { entries } + } else if path == "access" { + Handle::Access ++ } else if path == "uevent" { ++ Handle::Uevent + } else { + let idx = path.find('/').unwrap_or(path.len()); + let (addr_str, after) = path.split_at(idx); + let addr = parse_pci_addr(addr_str).ok_or(Error::new(ENOENT))?; + +- self.parse_after_pci_addr(addr, after)? ++ self.parse_after_pci_addr(addr, after, ctx)? + }; + + let stat = flags & O_STAT != 0; +@@ -132,7 +142,8 @@ impl SchemeSync for PciScheme { + let (len, mode) = match handle.inner { + Handle::TopLevel { ref entries } => (entries.len(), MODE_DIR | 0o755), + Handle::Device => (DEVICE_CONTENTS.len(), MODE_DIR | 0o755), +- Handle::Access | Handle::Channel { .. } => (0, MODE_CHR | 0o600), ++ Handle::Access | Handle::Channel { .. } | Handle::Bind { .. } => (0, MODE_CHR | 0o600), ++ Handle::Uevent => (0, MODE_CHR | 0o644), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + stat.st_size = len as u64; +@@ -160,7 +171,13 @@ impl SchemeSync for PciScheme { + addr: _, + ref mut st, + } => Self::read_channel(st, buf), +- Handle::SchemeRoot => Err(Error::new(EBADF)), ++ Handle::Uevent => { ++ // Uevent surface is ready for hotplug polling consumers. ++ // pcid currently only scans at startup, so return empty (EAGAIN would indicate no data available). ++ // Consumers can poll and re-read to check for new events. ++ Ok(0) ++ } ++ Handle::SchemeRoot | Handle::Bind { .. } => Err(Error::new(EBADF)), + _ => Err(Error::new(EBADF)), + } + } +@@ -193,7 +210,7 @@ impl SchemeSync for PciScheme { + return Ok(buf); + } + Handle::Device => DEVICE_CONTENTS, +- Handle::Access | Handle::Channel { .. } => return Err(Error::new(ENOTDIR)), ++ Handle::Access | Handle::Channel { .. } | Handle::Bind { .. } | Handle::Uevent => return Err(Error::new(ENOTDIR)), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + +@@ -316,6 +333,16 @@ impl SchemeSync for PciScheme { + func.enabled = false; + } + } ++ Some(HandleWrapper { ++ inner: Handle::Bind { addr }, ++ .. 
++ }) => { ++ let addr_str = format!("{}", addr); ++ if let Some(&owner_pid) = self.binds.get(&addr_str) { ++ log::info!("pcid: device {} unbound by pid {}", addr_str, owner_pid); ++ } ++ self.binds.remove(&addr_str); ++ } + _ => {} + } + } +@@ -327,9 +354,10 @@ impl PciScheme { + handles: HandleMap::new(), + pcie, + tree: BTreeMap::new(), ++ binds: HashMap::new(), + } + } +- fn parse_after_pci_addr(&mut self, addr: PciAddress, after: &str) -> Result { ++ fn parse_after_pci_addr(&mut self, addr: PciAddress, after: &str, ctx: &CallerCtx) -> Result { + if after.chars().next().map_or(false, |c| c != '/') { + return Err(Error::new(ENOENT)); + } +@@ -356,6 +384,17 @@ impl PciScheme { + st: ChannelState::AwaitingData, + } + } ++ "bind" => { ++ let addr_str = format!("{}", addr); ++ if let Some(&owner_pid) = self.binds.get(&addr_str) { ++ log::info!("pcid: device {} already bound by pid {}", addr_str, owner_pid); ++ return Err(Error::new(EALREADY)); ++ } ++ let caller_pid = ctx.pid; ++ self.binds.insert(addr_str.clone(), caller_pid); ++ log::info!("pcid: device {} bound by pid {}", addr_str, caller_pid); ++ Handle::Bind { addr } ++ } + _ => return Err(Error::new(ENOENT)), + } + }) diff --git a/local/patches/base/P3-xhci-device-hardening.patch b/local/patches/base/P3-xhci-device-hardening.patch new file mode 100644 index 00000000..4b76cd60 --- /dev/null +++ b/local/patches/base/P3-xhci-device-hardening.patch @@ -0,0 +1,1193 @@ +diff --git a/drivers/usb/xhcid/src/xhci/irq_reactor.rs b/drivers/usb/xhcid/src/xhci/irq_reactor.rs +index ac492d5b..68317b2d 100644 +--- a/drivers/usb/xhcid/src/xhci/irq_reactor.rs ++++ b/drivers/usb/xhcid/src/xhci/irq_reactor.rs +@@ -8,8 +8,11 @@ use std::task; + use std::os::unix::io::AsRawFd; + + use crossbeam_channel::{Receiver, Sender}; ++use futures::task::noop_waker; + use log::{debug, error, info, trace, warn}; + ++use common::timeout::Timeout; ++ + use super::doorbell::Doorbell; + use super::event::EventRing; + use super::ring::Ring; +@@ -44,6 +47,30 @@ pub struct NextEventTrb { + pub src_trb: Option, + } + ++pub struct PendingEventWait { ++ message: Arc>>, ++} ++ ++impl PendingEventWait { ++ pub fn wait_timeout(&self, timeout: std::time::Duration) -> Option { ++ let timeout = Timeout::new(timeout); ++ ++ loop { ++ let Ok(mut message) = self.message.lock() else { ++ return None; ++ }; ++ ++ if let Some(message) = message.take() { ++ return Some(message); ++ } ++ ++ drop(message); ++ ++ timeout.run().ok()?; ++ } ++ } ++} ++ + // TODO: Perhaps all of the transfer rings used by the xHC should be stored linearly, and then + // indexed using this struct instead. 
+ #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+@@ -626,6 +653,27 @@ impl Future for EventTrbFuture {
+ }
+
+ impl Xhci {
++ pub fn queue_command_completion_wait(
++ &self,
++ command_ring: &Ring,
++ trb: &Trb,
++ doorbell: EventDoorbell,
++ ) -> PendingEventWait {
++ let message = Arc::new(Mutex::new(None));
++ let send_result = self.irq_reactor_sender.send(State {
++ waker: noop_waker(),
++ kind: StateKind::CommandCompletion {
++ phys_ptr: command_ring.trb_phys_ptr(self.cap.ac64(), trb),
++ },
++ message: Arc::clone(&message),
++ is_isoch_or_vf: false,
++ });
++ if send_result.is_ok() {
++ doorbell.ring();
++ }
++ PendingEventWait { message }
++ }
++
+ pub fn get_transfer_trb(&self, paddr: u64, id: RingId) -> Option<Trb> {
+ self.with_ring(id, |ring| ring.phys_addr_to_entry(self.cap.ac64(), paddr))
+ .flatten()
+diff --git a/drivers/usb/xhcid/src/xhci/mod.rs b/drivers/usb/xhcid/src/xhci/mod.rs
+index f2143676..f406d16e 100644
+--- a/drivers/usb/xhcid/src/xhci/mod.rs
++++ b/drivers/usb/xhcid/src/xhci/mod.rs
+@@ -12,10 +12,11 @@
+ use std::collections::BTreeMap;
+ use std::convert::TryFrom;
+ use std::fs::File;
+-use std::sync::atomic::AtomicUsize;
++use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+ use std::sync::{Arc, Mutex};
+
+ use std::{mem, process, slice, thread};
++use std::time::Duration;
+ use syscall::error::{Error, Result, EBADF, EBADMSG, EIO, ENOENT};
+ use syscall::{EAGAIN, PAGE_SIZE};
+
+@@ -54,7 +55,7 @@ use self::event::EventRing;
+ use self::extended::{CapabilityId, ExtendedCapabilitiesIter, ProtocolSpeed, SupportedProtoCap};
+ use self::irq_reactor::{EventDoorbell, IrqReactor, NewPendingTrb, RingId};
+ use self::operational::*;
+-use self::port::Port;
++use self::port::{Port, PortLinkState};
+ use self::ring::Ring;
+ use self::runtime::RuntimeRegs;
+ use self::trb::{TransferKind, Trb, TrbCompletionCode};
+@@ -77,7 +78,174 @@
+ pub enum InterruptMethod {
+ Msi,
+ }
+
++const ATTACH_STEP_COUNT: usize = 6;
++const DETACH_TIMEOUT: Duration = Duration::from_millis(100);
++
++#[derive(Clone, Copy, Debug, Eq, PartialEq)]
++enum PortAttachmentState {
++ Pending,
++ Attached,
++ Detaching,
++}
++
++struct PortRuntime {
++ detaching: AtomicBool,
++ inflight_transfers: AtomicUsize,
++}
++
++impl PortRuntime {
++ fn new() -> Arc<Self> {
++ Arc::new(Self {
++ detaching: AtomicBool::new(false),
++ inflight_transfers: AtomicUsize::new(0),
++ })
++ }
++
++ fn begin_transfer(self: &Arc<Self>) -> Result<InflightTransferGuard> {
++ if self.detaching.load(Ordering::SeqCst) {
++ return Err(Error::new(EAGAIN));
++ }
++
++ self.inflight_transfers.fetch_add(1, Ordering::SeqCst);
++
++ if self.detaching.load(Ordering::SeqCst) {
++ self.inflight_transfers.fetch_sub(1, Ordering::SeqCst);
++ return Err(Error::new(EAGAIN));
++ }
++
++ Ok(InflightTransferGuard {
++ runtime: Arc::clone(self),
++ })
++ }
++
++ fn begin_detach(&self) {
++ self.detaching.store(true, Ordering::SeqCst);
++ }
++
++ fn inflight_transfers(&self) -> usize {
++ self.inflight_transfers.load(Ordering::SeqCst)
++ }
++}
++
++struct InflightTransferGuard {
++ runtime: Arc<PortRuntime>,
++}
++
++impl Drop for InflightTransferGuard {
++ fn drop(&mut self) {
++ self.runtime.inflight_transfers.fetch_sub(1, Ordering::SeqCst);
++ }
++}
++
+ impl Xhci {
++ fn any_port_state(&self, port: PortId) -> Result<chashmap::ReadGuard<'_, PortId, PortState>> {
++ self.port_states
++ .get(&port)
++ .or_else(|| self.pending_port_states.get(&port))
++ .ok_or(Error::new(ENOENT))
++ }
++
++ fn any_port_state_mut(
++ &self,
++ port: PortId,
++ ) -> Result<chashmap::WriteGuard<'_, PortId, PortState>> {
++ self.port_states
++ .get_mut(&port)
++ .or_else(||
self.pending_port_states.get_mut(&port))
++ .ok_or(Error::new(ENOENT))
++ }
++
++ pub(crate) fn begin_transfer_guard(&self, port: PortId) -> Result<InflightTransferGuard> {
++ let port_state = self.port_states.get(&port).ok_or(Error::new(ENOENT))?;
++ port_state.runtime.begin_transfer()
++ }
++
++ fn begin_internal_transfer_guard(&self, port: PortId) -> Result<InflightTransferGuard> {
++ let port_state = self.any_port_state(port)?;
++ port_state.runtime.begin_transfer()
++ }
++
++ fn wait_for_transfer_drain(&self, runtime: &PortRuntime, timeout: Duration) -> bool {
++ let timeout = Timeout::new(timeout);
++ loop {
++ if runtime.inflight_transfers() == 0 {
++ return true;
++ }
++
++ if timeout.run().is_err() {
++ return false;
++ }
++ }
++ }
++
++ fn attach_log_step(slot: u8, step: usize, step_name: &str) {
++ info!(
++ "xhcid: attach step {}/{} for slot {}: {}",
++ step, ATTACH_STEP_COUNT, slot, step_name
++ );
++ }
++
++ async fn rollback_attach(&self, port_id: PortId, slot: u8, step: usize) -> Result<()> {
++ warn!(
++ "xhcid: attach failed at step {}, rolling back slot {}",
++ step, slot
++ );
++ self.pending_port_states.remove(&port_id);
++
++ match self.disable_port_slot(slot).await {
++ Ok(()) => Ok(()),
++ Err(err) => {
++ warn!(
++ "xhcid: failed to disable slot {} during attach rollback on port {}: {}",
++ slot, port_id, err
++ );
++ Err(err)
++ }
++ }
++ }
++
++ fn current_link_state(&self, port_id: PortId) -> Option<PortLinkState> {
++ PortLinkState::from_port_state(self.get_pls(port_id))
++ }
++
++ fn transition_link_state(&self, port_id: PortId, slot: u8, to: PortLinkState) {
++ let from = self.current_link_state(port_id);
++
++ if from == Some(to) {
++ return;
++ }
++
++ if let Some(from) = from {
++ info!(
++ "xhcid: PM slot {}: {}→{}",
++ slot,
++ from.as_str(),
++ to.as_str()
++ );
++ }
++
++ if let Ok(mut ports) = self.ports.lock() {
++ if let Some(port) = ports.get_mut(port_id.root_hub_port_index()) {
++ port.set_link_state(to);
++ }
++ }
++ }
++
++ fn wake_port_to_u0(&self, port_id: PortId, slot: u8) {
++ if self.current_link_state(port_id) == Some(PortLinkState::U3) {
++ self.transition_link_state(port_id, slot, PortLinkState::U0);
++ }
++ }
++
++ fn quiesce_port_to_u3(&self, port_id: PortId, slot: u8) {
++ if self.current_link_state(port_id) == Some(PortLinkState::U0) {
++ self.transition_link_state(port_id, slot, PortLinkState::U2);
++ }
++ if self.current_link_state(port_id) == Some(PortLinkState::U2) {
++ self.transition_link_state(port_id, slot, PortLinkState::U3);
++ }
++ }
++
+ /// Gets descriptors, before the port state is initiated.
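++ /// Wakes the port to U0 and holds an inflight-transfer guard for the duration,
++ /// so a concurrent detach either drains this transfer or fails it early with EAGAIN.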
+ async fn get_desc_raw( + &self, +@@ -103,8 +271,11 @@ impl Xhci { + len + ); + ++ self.wake_port_to_u0(port, slot); ++ let _transfer_guard = self.begin_internal_transfer_guard(port)?; ++ + let future = { +- let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(ENOENT))?; ++ let mut port_state = self.any_port_state_mut(port)?; + let ring = port_state + .endpoint_states + .get_mut(&0) +@@ -283,6 +454,7 @@ pub struct Xhci { + handles: CHashMap, + next_handle: AtomicUsize, + port_states: CHashMap>, ++ pending_port_states: CHashMap>, + drivers: CHashMap>, + scheme_name: String, + +@@ -305,9 +477,11 @@ unsafe impl Send for Xhci {} + unsafe impl Sync for Xhci {} + + struct PortState { ++ attachment_state: PortAttachmentState, + slot: u8, + protocol_speed: &'static ProtocolSpeed, + cfg_idx: Option, ++ runtime: Arc, + input_context: Mutex>>, + dev_desc: Option, + endpoint_states: BTreeMap, +@@ -463,6 +637,7 @@ impl Xhci { + handles: CHashMap::new(), + next_handle: AtomicUsize::new(0), + port_states: CHashMap::new(), ++ pending_port_states: CHashMap::new(), + drivers: CHashMap::new(), + scheme_name, + +@@ -793,7 +968,8 @@ impl Xhci { + } + + pub async fn attach_device(&self, port_id: PortId) -> syscall::Result<()> { +- if self.port_states.contains_key(&port_id) { ++ if self.port_states.contains_key(&port_id) || self.pending_port_states.contains_key(&port_id) ++ { + debug!("Already contains port {}", port_id); + return Err(syscall::Error::new(EAGAIN)); + } +@@ -822,10 +998,12 @@ impl Xhci { + let slot = match self.enable_port_slot(slot_ty).await { + Ok(ok) => ok, + Err(err) => { ++ warn!("xhcid: attach failed at step 1, rolling back slot 0"); + error!("Failed to enable slot for port {}: {}", port_id, err); + return Err(err); + } + }; ++ Self::attach_log_step(slot, 1, "enable_slot"); + + debug!("Enabled port {}, which the xHC mapped to {}", port_id, slot); + +@@ -836,6 +1014,7 @@ impl Xhci { + + let mut input = unsafe { self.alloc_dma_zeroed::>()? }; + ++ Self::attach_log_step(slot, 2, "address_device"); + debug!("Attempting to address the device"); + let mut ring = match self + .address_device(&mut input, port_id, slot_ty, slot, protocol_speed, speed) +@@ -844,6 +1023,7 @@ impl Xhci { + Ok(device_ring) => device_ring, + Err(err) => { + error!("Failed to address device for port {}: `{}`", port_id, err); ++ let _ = self.rollback_attach(port_id, slot, 2).await; + return Err(err); + } + }; +@@ -853,11 +1033,13 @@ impl Xhci { + // TODO: Should the descriptors be cached in PortState, or refetched? 
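++ // Attach lifecycle: the state starts as Pending in pending_port_states, is
++ // published to port_states as Attached only once the default control pipe is
++ // configured, and is flipped to Detaching before teardown so new transfers
++ // are refused.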
+ + let mut port_state = PortState { ++ attachment_state: PortAttachmentState::Pending, + slot, + protocol_speed, + input_context: Mutex::new(input), + dev_desc: None, + cfg_idx: None, ++ runtime: PortRuntime::new(), + endpoint_states: std::iter::once(( + 0, + EndpointState { +@@ -867,42 +1049,81 @@ impl Xhci { + )) + .collect::>(), + }; +- self.port_states.insert(port_id, port_state); +- debug!("Got port states!"); ++ self.pending_port_states.insert(port_id, port_state); ++ debug!("Got pending port states!"); + + // Ensure correct packet size is used +- let dev_desc_8_byte = self.fetch_dev_desc_8_byte(port_id, slot).await?; ++ Self::attach_log_step(slot, 3, "fetch_device_descriptor_8"); ++ let dev_desc_8_byte = match self.fetch_dev_desc_8_byte(port_id, slot).await { ++ Ok(desc) => desc, ++ Err(err) => { ++ let _ = self.rollback_attach(port_id, slot, 3).await; ++ return Err(err); ++ } ++ }; + { +- let mut port_state = self.port_states.get_mut(&port_id).unwrap(); +- +- let mut input = port_state.input_context.lock().unwrap(); +- +- self.update_max_packet_size(&mut *input, slot, dev_desc_8_byte) +- .await?; ++ let mut port_state = self.any_port_state_mut(port_id)?; ++ ++ let mut input = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; ++ ++ Self::attach_log_step(slot, 4, "update_max_packet_size"); ++ if let Err(err) = self.update_max_packet_size(&mut *input, slot, dev_desc_8_byte).await ++ { ++ drop(input); ++ drop(port_state); ++ let _ = self.rollback_attach(port_id, slot, 4).await; ++ return Err(err); ++ } + } + + debug!("Got the 8 byte dev descriptor: {:X?}", dev_desc_8_byte); + +- let dev_desc = self.get_desc(port_id, slot).await?; ++ Self::attach_log_step(slot, 5, "fetch_device_descriptor"); ++ let dev_desc = match self.get_desc(port_id, slot).await { ++ Ok(desc) => desc, ++ Err(err) => { ++ let _ = self.rollback_attach(port_id, slot, 5).await; ++ return Err(err); ++ } ++ }; + debug!("Got the full device descriptor!"); +- self.port_states.get_mut(&port_id).unwrap().dev_desc = Some(dev_desc); ++ self.any_port_state_mut(port_id)?.dev_desc = Some(dev_desc); + + debug!("Got the port states again!"); + { +- let mut port_state = self.port_states.get_mut(&port_id).unwrap(); ++ let mut port_state = self.any_port_state_mut(port_id)?; + +- let mut input = port_state.input_context.lock().unwrap(); ++ let mut input = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; + debug!("Got the input context!"); +- let dev_desc = port_state.dev_desc.as_ref().unwrap(); +- +- self.update_default_control_pipe(&mut *input, slot, dev_desc) +- .await?; ++ let dev_desc = port_state.dev_desc.as_ref().ok_or(Error::new(EIO))?; ++ ++ Self::attach_log_step(slot, 6, "configure_default_control_pipe"); ++ if let Err(err) = self.update_default_control_pipe(&mut *input, slot, dev_desc).await ++ { ++ drop(input); ++ drop(port_state); ++ let _ = self.rollback_attach(port_id, slot, 6).await; ++ return Err(err); ++ } + } + + debug!("Updated the default control pipe"); + ++ if let Some(mut published_state) = self.pending_port_states.remove(&port_id) { ++ published_state.attachment_state = PortAttachmentState::Attached; ++ self.port_states.insert(port_id, published_state); ++ } ++ + match self.spawn_drivers(port_id) { +- Ok(()) => (), ++ Ok(()) => { ++ info!("xhcid: uevent add device usb/{}", port_id.root_hub_port_num); ++ } + Err(err) => { + error!("Failed to spawn driver for port {}: `{}`", port_id, err) + } +@@ -915,6 +1136,32 @@ impl Xhci { + } + + pub async fn 
detach_device(&self, port_id: PortId) -> Result { ++ if let Some(mut pending_state) = self.pending_port_states.remove(&port_id) { ++ pending_state.attachment_state = PortAttachmentState::Detaching; ++ pending_state.runtime.begin_detach(); ++ let _ = self.rollback_attach(port_id, pending_state.slot, ATTACH_STEP_COUNT).await; ++ return Ok(true); ++ } ++ ++ let (slot, runtime, endpoints) = match self.port_states.get_mut(&port_id) { ++ Some(mut state) => { ++ state.attachment_state = PortAttachmentState::Detaching; ++ state.runtime.begin_detach(); ++ ( ++ state.slot, ++ Arc::clone(&state.runtime), ++ state.endpoint_states.keys().copied().collect::>(), ++ ) ++ } ++ None => { ++ debug!( ++ "Attempted to detach from port {}, which wasn't previously attached.", ++ port_id ++ ); ++ return Ok(false); ++ } ++ }; ++ + if let Some(children) = self.drivers.remove(&port_id) { + for mut child in children { + info!("killing driver process {} for port {}", child.id(), port_id); +@@ -962,21 +1209,38 @@ impl Xhci { + } + } + +- if let Some(state) = self.port_states.remove(&port_id) { +- debug!("disabling port slot {} for port {}", state.slot, port_id); +- let result = self.disable_port_slot(state.slot).await.and(Ok(true)); +- debug!( +- "disabled port slot {} for port {} with result: {:?}", +- state.slot, port_id, result +- ); +- result +- } else { +- debug!( +- "Attempted to detach from port {}, which wasn't previously attached.", +- port_id +- ); +- Ok(false) ++ let drained = self.wait_for_transfer_drain(&runtime, DETACH_TIMEOUT); ++ ++ self.wake_port_to_u0(port_id, slot); ++ ++ let mut timed_out = !drained; ++ for endp_num in endpoints { ++ if timed_out { ++ break; ++ } ++ ++ if !self.stop_endpoint_with_timeout(port_id, slot, endp_num, DETACH_TIMEOUT).await? { ++ timed_out = true; ++ break; ++ } + } ++ ++ self.quiesce_port_to_u3(port_id, slot); ++ ++ if timed_out { ++ warn!("xhcid: forced detach slot {} after timeout", slot); ++ } ++ ++ debug!("disabling port slot {} for port {}", slot, port_id); ++ let result = self.disable_port_slot(slot).await.and(Ok(true)); ++ debug!( ++ "disabled port slot {} for port {} with result: {:?}", ++ slot, port_id, result ++ ); ++ ++ self.port_states.remove(&port_id); ++ info!("xhcid: uevent remove device usb/{}", port_id.root_hub_port_num); ++ result + } + + pub async fn update_max_packet_size( +diff --git a/drivers/usb/xhcid/src/xhci/port.rs b/drivers/usb/xhcid/src/xhci/port.rs +index 0654ccc3..5edbd9cb 100644 +--- a/drivers/usb/xhcid/src/xhci/port.rs ++++ b/drivers/usb/xhcid/src/xhci/port.rs +@@ -46,6 +46,37 @@ bitflags! { + } + } + ++#[derive(Clone, Copy, Debug, Eq, PartialEq)] ++pub enum PortLinkState { ++ U0 = 0, ++ U1 = 1, ++ U2 = 2, ++ U3 = 3, ++} ++ ++impl PortLinkState { ++ const PORT_STATE_MASK: u32 = 0b1111 << 5; ++ ++ pub fn from_port_state(state: u8) -> Option { ++ match state { ++ 0 => Some(Self::U0), ++ 1 => Some(Self::U1), ++ 2 => Some(Self::U2), ++ 3 => Some(Self::U3), ++ _ => None, ++ } ++ } ++ ++ pub fn as_str(self) -> &'static str { ++ match self { ++ Self::U0 => "U0", ++ Self::U1 => "U1", ++ Self::U2 => "U2", ++ Self::U3 => "U3", ++ } ++ } ++} ++ + #[repr(C, packed)] + pub struct Port { + // This has write one to clear fields, do not expose it, handle writes carefully! 
+@@ -75,6 +106,14 @@ impl Port { + .write((self.flags_preserved() | PortFlags::PR).bits()); + } + ++ pub fn set_link_state(&mut self, state: PortLinkState) { ++ let mut value = self.flags_preserved().bits(); ++ value &= !PortLinkState::PORT_STATE_MASK; ++ value |= (state as u32) << 5; ++ value |= PortFlags::LWS.bits(); ++ self.portsc.write(value); ++ } ++ + pub fn state(&self) -> u8 { + ((self.read() & (0b1111 << 5)) >> 5) as u8 + } +diff --git a/drivers/usb/xhcid/src/xhci/scheme.rs b/drivers/usb/xhcid/src/xhci/scheme.rs +index ca27b3fe..2c27b906 100644 +--- a/drivers/usb/xhcid/src/xhci/scheme.rs ++++ b/drivers/usb/xhcid/src/xhci/scheme.rs +@@ -20,7 +20,9 @@ use std::convert::TryFrom; + use std::io::prelude::*; + use std::ops::Deref; + use std::sync::atomic; +-use std::{cmp, fmt, io, mem, str}; ++use std::collections::BTreeMap; ++use std::time::Duration; ++use std::{cmp, fmt, io, mem, ptr, str}; + + use common::dma::Dma; + use futures::executor::block_on; +@@ -557,6 +559,31 @@ impl AnyDescriptor { + } + + impl Xhci { ++ fn snapshot_input_context(input_context: &Dma>) -> Box<[u8]> { ++ let mut snapshot = vec![0u8; mem::size_of::>()].into_boxed_slice(); ++ unsafe { ++ ptr::copy_nonoverlapping( ++ (&**input_context) as *const super::InputContext as *const u8, ++ snapshot.as_mut_ptr(), ++ snapshot.len(), ++ ); ++ } ++ snapshot ++ } ++ ++ fn restore_input_context( ++ input_context: &mut Dma>, ++ snapshot: &[u8], ++ ) { ++ unsafe { ++ ptr::copy_nonoverlapping( ++ snapshot.as_ptr(), ++ (&mut **input_context) as *mut super::InputContext as *mut u8, ++ snapshot.len(), ++ ); ++ } ++ } ++ + async fn new_if_desc( + &self, + port_id: PortId, +@@ -629,6 +656,37 @@ impl Xhci { + + (event_trb, command_trb) + } ++ pub fn execute_command_with_timeout( ++ &self, ++ timeout: Duration, ++ f: F, ++ ) -> Option<(Trb, Trb)> { ++ if self.interrupt_is_pending(0) { ++ debug!("The EHB bit is already set!"); ++ } ++ ++ let pending_wait = { ++ let Ok(mut command_ring) = self.cmd.lock() else { ++ return None; ++ }; ++ let (cmd_index, cycle) = (command_ring.next_index(), command_ring.cycle); ++ ++ { ++ let command_trb = &mut command_ring.trbs[cmd_index]; ++ f(command_trb, cycle); ++ } ++ ++ let command_trb = &command_ring.trbs[cmd_index]; ++ self.queue_command_completion_wait( ++ &*command_ring, ++ command_trb, ++ EventDoorbell::new(self, 0, 0), ++ ) ++ }; ++ ++ let trbs = pending_wait.wait_timeout(timeout)?; ++ Some((trbs.event_trb, trbs.src_trb?)) ++ } + pub async fn execute_control_transfer( + &self, + port_num: PortId, +@@ -640,6 +698,9 @@ impl Xhci { + where + D: FnMut(&mut Trb, bool) -> ControlFlow, + { ++ self.wake_port_to_u0(port_num, self.slot(port_num)?); ++ let _transfer_guard = self.begin_transfer_guard(port_num)?; ++ + let future = { + let mut port_state = self.port_state_mut(port_num)?; + let slot = port_state.slot; +@@ -710,6 +771,9 @@ impl Xhci { + where + D: FnMut(&mut Trb, bool) -> ControlFlow, + { ++ self.wake_port_to_u0(port_num, self.slot(port_num)?); ++ let _transfer_guard = self.begin_transfer_guard(port_num)?; ++ + let endp_idx = endp_num.checked_sub(1).ok_or(Error::new(EIO))?; + let mut port_state = self.port_state_mut(port_num)?; + +@@ -863,6 +927,34 @@ impl Xhci { + handle_event_trb("RESET_ENDPOINT", &event_trb, &command_trb) + } + ++ pub async fn stop_endpoint_with_timeout( ++ &self, ++ port_num: PortId, ++ slot: u8, ++ endp_num: u8, ++ timeout: Duration, ++ ) -> Result { ++ let endp_num_xhc = if endp_num == 0 { ++ 1 ++ } else { ++ let endp_idx = 
endp_num.checked_sub(1).ok_or(Error::new(EIO))?; ++ let port_state = self.port_states.get(&port_num).ok_or(Error::new(EBADFD))?; ++ let endp_desc = port_state ++ .get_endp_desc(endp_idx) ++ .ok_or(Error::new(EBADFD))?; ++ Self::endp_num_to_dci(endp_num, endp_desc) ++ }; ++ ++ let Some((event_trb, command_trb)) = self.execute_command_with_timeout(timeout, |trb, cycle| { ++ trb.stop_endpoint(slot, endp_num_xhc, false, cycle); ++ }) else { ++ return Ok(false); ++ }; ++ ++ handle_event_trb("STOP_ENDPOINT", &event_trb, &command_trb)?; ++ Ok(true) ++ } ++ + fn endp_ctx_interval(speed_id: &ProtocolSpeed, endp_desc: &EndpDesc) -> u8 { + /// Logarithmic (base 2) 125 µs periods per millisecond. + const MILLISEC_PERIODS: u8 = 3; +@@ -956,9 +1048,7 @@ impl Xhci { + req: &ConfigureEndpointsReq, + ) -> Result<()> { + let (endp_desc_count, new_context_entries, configuration_value) = { +- let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(EBADFD))?; +- +- port_state.cfg_idx = Some(req.config_desc); ++ let port_state = self.port_states.get(&port).ok_or(Error::new(EBADFD))?; + + let config_desc = port_state + .dev_desc +@@ -1003,210 +1093,259 @@ impl Xhci { + Error::new(EIO) + })?; + +- { ++ let (slot, previous_cfg_idx, input_snapshot) = { + let port_state = self.port_states.get(&port).ok_or(Error::new(EBADFD))?; +- let mut input_context = port_state.input_context.lock().unwrap(); +- +- // Configure the slot context as well, which holds the last index of the endp descs. +- input_context.add_context.write(1); +- input_context.drop_context.write(0); +- +- const CONTEXT_ENTRIES_MASK: u32 = 0xF800_0000; +- const CONTEXT_ENTRIES_SHIFT: u8 = 27; ++ let input_snapshot = { ++ let input_context = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; ++ Self::snapshot_input_context(&*input_context) ++ }; + +- const HUB_PORTS_MASK: u32 = 0xFF00_0000; +- const HUB_PORTS_SHIFT: u8 = 24; ++ (port_state.slot, port_state.cfg_idx, input_snapshot) ++ }; + +- let mut current_slot_a = input_context.device.slot.a.read(); +- let mut current_slot_b = input_context.device.slot.b.read(); ++ let mut staged_endpoint_states = BTreeMap::new(); ++ let stage_result = (|| -> Result<()> { ++ let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(EBADFD))?; + +- // Set context entries +- current_slot_a &= !CONTEXT_ENTRIES_MASK; +- current_slot_a |= +- (u32::from(new_context_entries) << CONTEXT_ENTRIES_SHIFT) & CONTEXT_ENTRIES_MASK; ++ { ++ let mut input_context = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; + +- // Set hub data +- current_slot_a &= !(1 << 26); +- current_slot_b &= !HUB_PORTS_MASK; +- if let Some(hub_ports) = req.hub_ports { +- current_slot_a |= 1 << 26; +- current_slot_b |= (u32::from(hub_ports) << HUB_PORTS_SHIFT) & HUB_PORTS_MASK; +- } ++ // Configure the slot context as well, which holds the last index of the endp descs. 
++ input_context.add_context.write(1); ++ input_context.drop_context.write(0); + +- input_context.device.slot.a.write(current_slot_a); +- input_context.device.slot.b.write(current_slot_b); ++ const CONTEXT_ENTRIES_MASK: u32 = 0xF800_0000; ++ const CONTEXT_ENTRIES_SHIFT: u8 = 27; + +- let control = if self.op.lock().unwrap().cie() { +- (u32::from(req.alternate_setting.unwrap_or(0)) << 16) +- | (u32::from(req.interface_desc.unwrap_or(0)) << 8) +- | u32::from(configuration_value) +- } else { +- 0 +- }; +- input_context.control.write(control); +- } ++ const HUB_PORTS_MASK: u32 = 0xFF00_0000; ++ const HUB_PORTS_SHIFT: u8 = 24; + +- for endp_idx in 0..endp_desc_count as u8 { +- let endp_num = endp_idx + 1; ++ let mut current_slot_a = input_context.device.slot.a.read(); ++ let mut current_slot_b = input_context.device.slot.b.read(); + +- let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(EBADFD))?; +- let dev_desc = port_state.dev_desc.as_ref().unwrap(); +- let endp_desc = port_state.get_endp_desc(endp_idx).ok_or_else(|| { +- warn!("failed to find endpoint {}", endp_idx); +- Error::new(EIO) +- })?; ++ current_slot_a &= !CONTEXT_ENTRIES_MASK; ++ current_slot_a |= (u32::from(new_context_entries) << CONTEXT_ENTRIES_SHIFT) ++ & CONTEXT_ENTRIES_MASK; + +- let endp_num_xhc = Self::endp_num_to_dci(endp_num, endp_desc); ++ current_slot_a &= !(1 << 26); ++ current_slot_b &= !HUB_PORTS_MASK; ++ if let Some(hub_ports) = req.hub_ports { ++ current_slot_a |= 1 << 26; ++ current_slot_b |= (u32::from(hub_ports) << HUB_PORTS_SHIFT) & HUB_PORTS_MASK; ++ } + +- let usb_log_max_streams = endp_desc.log_max_streams(); ++ input_context.device.slot.a.write(current_slot_a); ++ input_context.device.slot.b.write(current_slot_b); + +- // TODO: Secondary streams. +- let primary_streams = if let Some(log_max_streams) = usb_log_max_streams { +- // TODO: Can streams-capable be configured to not use streams? +- if log_max_psa_size != 0 { +- cmp::min(u8::from(log_max_streams), log_max_psa_size + 1) - 1 ++ let control = if self.op.lock().map_err(|_| Error::new(EIO))?.cie() { ++ (u32::from(req.alternate_setting.unwrap_or(0)) << 16) ++ | (u32::from(req.interface_desc.unwrap_or(0)) << 8) ++ | u32::from(configuration_value) + } else { + 0 +- } +- } else { +- 0 +- }; +- let linear_stream_array = if primary_streams != 0 { true } else { false }; ++ }; ++ input_context.control.write(control); ++ } + +- // TODO: Interval related fields +- // TODO: Max ESIT payload size. 
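++        // Stage each endpoint's new state off to the side; it is only
++        // merged into the live `endpoint_states` map after the
++        // CONFIGURE_ENDPOINT command and SET_CONFIGURATION both succeed.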
++ for endp_idx in 0..endp_desc_count as u8 { ++ let endp_num = endp_idx + 1; + +- let mult = endp_desc.isoch_mult(lec); ++ let dev_desc = port_state.dev_desc.as_ref().ok_or(Error::new(EBADFD))?; ++ let endp_desc = port_state.get_endp_desc(endp_idx).ok_or_else(|| { ++ warn!("failed to find endpoint {}", endp_idx); ++ Error::new(EIO) ++ })?; + +- let max_packet_size = Self::endp_ctx_max_packet_size(endp_desc); +- let max_burst_size = Self::endp_ctx_max_burst(speed_id, dev_desc, endp_desc); ++ let endp_num_xhc = Self::endp_num_to_dci(endp_num, endp_desc); + +- let max_esit_payload = Self::endp_ctx_max_esit_payload( +- speed_id, +- dev_desc, +- endp_desc, +- max_packet_size, +- max_burst_size, +- ); +- let max_esit_payload_lo = max_esit_payload as u16; +- let max_esit_payload_hi = ((max_esit_payload & 0x00FF_0000) >> 16) as u8; +- +- let interval = Self::endp_ctx_interval(speed_id, endp_desc); +- +- let max_error_count = 3; +- let ep_ty = endp_desc.xhci_ep_type()?; +- let host_initiate_disable = false; +- +- // TODO: Maybe this value is out of scope for xhcid, because the actual usb device +- // driver probably knows better. The spec says that the initial value should be 8 bytes +- // for control, 1KiB for interrupt and 3KiB for bulk and isoch. +- let avg_trb_len: u16 = match endp_desc.ty() { +- EndpointTy::Ctrl => { +- warn!("trying to use control endpoint"); +- return Err(Error::new(EIO)); // only endpoint zero is of type control, and is configured separately with the address device command. +- } +- EndpointTy::Bulk | EndpointTy::Isoch => 3072, // 3 KiB +- EndpointTy::Interrupt => 1024, // 1 KiB +- }; ++ let usb_log_max_streams = endp_desc.log_max_streams(); ++ let primary_streams = if let Some(log_max_streams) = usb_log_max_streams { ++ if log_max_psa_size != 0 { ++ cmp::min(u8::from(log_max_streams), log_max_psa_size + 1) - 1 ++ } else { ++ 0 ++ } ++ } else { ++ 0 ++ }; ++ let linear_stream_array = primary_streams != 0; ++ ++ let mult = endp_desc.isoch_mult(lec); ++ let max_packet_size = Self::endp_ctx_max_packet_size(endp_desc); ++ let max_burst_size = Self::endp_ctx_max_burst(speed_id, dev_desc, endp_desc); ++ let max_esit_payload = Self::endp_ctx_max_esit_payload( ++ speed_id, ++ dev_desc, ++ endp_desc, ++ max_packet_size, ++ max_burst_size, ++ ); ++ let max_esit_payload_lo = max_esit_payload as u16; ++ let max_esit_payload_hi = ((max_esit_payload & 0x00FF_0000) >> 16) as u8; ++ let interval = Self::endp_ctx_interval(speed_id, endp_desc); ++ ++ let max_error_count = 3; ++ let ep_ty = endp_desc.xhci_ep_type()?; ++ let host_initiate_disable = false; ++ let avg_trb_len: u16 = match endp_desc.ty() { ++ EndpointTy::Ctrl => { ++ warn!("trying to use control endpoint"); ++ return Err(Error::new(EIO)); ++ } ++ EndpointTy::Bulk | EndpointTy::Isoch => 3072, ++ EndpointTy::Interrupt => 1024, ++ }; + +- assert_eq!(ep_ty & 0x7, ep_ty); +- assert_eq!(mult & 0x3, mult); +- assert_eq!(max_error_count & 0x3, max_error_count); +- assert_ne!(ep_ty, 0); // 0 means invalid. 
++ assert_eq!(ep_ty & 0x7, ep_ty); ++ assert_eq!(mult & 0x3, mult); ++ assert_eq!(max_error_count & 0x3, max_error_count); ++ assert_ne!(ep_ty, 0); ++ ++ let (ring_ptr, staged_state) = if usb_log_max_streams.is_some() { ++ let mut array = ++ StreamContextArray::new::(self.cap.ac64(), 1 << (primary_streams + 1))?; ++ array.add_ring::(self.cap.ac64(), 1, true)?; ++ let array_ptr = array.register(); ++ ++ assert_eq!( ++ array_ptr & 0xFFFF_FFFF_FFFF_FF81, ++ array_ptr, ++ "stream ctx ptr not aligned to 16 bytes" ++ ); + +- let ring_ptr = if usb_log_max_streams.is_some() { +- let mut array = +- StreamContextArray::new::(self.cap.ac64(), 1 << (primary_streams + 1))?; ++ ( ++ array_ptr, ++ EndpointState { ++ transfer: super::RingOrStreams::Streams(array), ++ driver_if_state: EndpIfState::Init, ++ }, ++ ) ++ } else { ++ let ring = Ring::new::(self.cap.ac64(), 16, true)?; ++ let ring_ptr = ring.register(); + +- // TODO: Use as many stream rings as needed. +- array.add_ring::(self.cap.ac64(), 1, true)?; +- let array_ptr = array.register(); ++ assert_eq!( ++ ring_ptr & 0xFFFF_FFFF_FFFF_FF81, ++ ring_ptr, ++ "ring pointer not aligned to 16 bytes" ++ ); + +- assert_eq!( +- array_ptr & 0xFFFF_FFFF_FFFF_FF81, +- array_ptr, +- "stream ctx ptr not aligned to 16 bytes" ++ ( ++ ring_ptr, ++ EndpointState { ++ transfer: super::RingOrStreams::Ring(ring), ++ driver_if_state: EndpIfState::Init, ++ }, ++ ) ++ }; ++ assert_eq!(primary_streams & 0x1F, primary_streams); ++ ++ staged_endpoint_states.insert(endp_num, staged_state); ++ ++ let mut input_context = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; ++ input_context.add_context.writef(1 << endp_num_xhc, true); ++ ++ let endp_i = endp_num_xhc as usize - 1; ++ input_context.device.endpoints[endp_i].a.write( ++ u32::from(mult) << 8 ++ | u32::from(primary_streams) << 10 ++ | u32::from(linear_stream_array) << 15 ++ | u32::from(interval) << 16 ++ | u32::from(max_esit_payload_hi) << 24, + ); +- port_state.endpoint_states.insert( +- endp_num, +- EndpointState { +- transfer: super::RingOrStreams::Streams(array), +- driver_if_state: EndpIfState::Init, +- }, ++ input_context.device.endpoints[endp_i].b.write( ++ max_error_count << 1 ++ | u32::from(ep_ty) << 3 ++ | u32::from(host_initiate_disable) << 7 ++ | u32::from(max_burst_size) << 8 ++ | u32::from(max_packet_size) << 16, + ); + +- array_ptr +- } else { +- let ring = Ring::new::(self.cap.ac64(), 16, true)?; +- let ring_ptr = ring.register(); +- +- assert_eq!( +- ring_ptr & 0xFFFF_FFFF_FFFF_FF81, +- ring_ptr, +- "ring pointer not aligned to 16 bytes" +- ); +- port_state.endpoint_states.insert( +- endp_num, +- EndpointState { +- transfer: super::RingOrStreams::Ring(ring), +- driver_if_state: EndpIfState::Init, +- }, +- ); +- ring_ptr +- }; +- assert_eq!(primary_streams & 0x1F, primary_streams); +- +- let mut input_context = port_state.input_context.lock().unwrap(); +- input_context.add_context.writef(1 << endp_num_xhc, true); +- +- let endp_i = endp_num_xhc as usize - 1; +- input_context.device.endpoints[endp_i].a.write( +- u32::from(mult) << 8 +- | u32::from(primary_streams) << 10 +- | u32::from(linear_stream_array) << 15 +- | u32::from(interval) << 16 +- | u32::from(max_esit_payload_hi) << 24, +- ); +- input_context.device.endpoints[endp_i].b.write( +- max_error_count << 1 +- | u32::from(ep_ty) << 3 +- | u32::from(host_initiate_disable) << 7 +- | u32::from(max_burst_size) << 8 +- | u32::from(max_packet_size) << 16, +- ); ++ input_context.device.endpoints[endp_i].trl.write(ring_ptr as u32); 
++ input_context.device.endpoints[endp_i].trh.write((ring_ptr >> 32) as u32); ++ input_context.device.endpoints[endp_i] ++ .c ++ .write(u32::from(avg_trb_len) | (u32::from(max_esit_payload_lo) << 16)); + +- input_context.device.endpoints[endp_i] +- .trl +- .write(ring_ptr as u32); +- input_context.device.endpoints[endp_i] +- .trh +- .write((ring_ptr >> 32) as u32); ++ log::debug!("initialized endpoint {}", endp_num); ++ } + +- input_context.device.endpoints[endp_i] +- .c +- .write(u32::from(avg_trb_len) | (u32::from(max_esit_payload_lo) << 16)); ++ Ok(()) ++ })(); + +- log::debug!("initialized endpoint {}", endp_num); ++ if let Err(err) = stage_result { ++ warn!("xhcid: configure slot {} failed, rolling back", slot); ++ let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(EBADFD))?; ++ let mut input_context = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; ++ Self::restore_input_context(&mut *input_context, &input_snapshot); ++ return Err(err); + } + +- { +- let port_state = self.port_states.get(&port).ok_or(Error::new(EBADFD))?; +- let slot = port_state.slot; +- let input_context_physical = port_state.input_context.lock().unwrap().physical(); ++ let input_context_physical = self ++ .port_states ++ .get(&port) ++ .ok_or(Error::new(EBADFD))? ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))? ++ .physical(); ++ ++ let (event_trb, command_trb) = self ++ .execute_command(|trb, cycle| trb.configure_endpoint(slot, input_context_physical, cycle)) ++ .await; ++ ++ if let Err(err) = handle_event_trb("CONFIGURE_ENDPOINT", &event_trb, &command_trb) { ++ warn!("xhcid: configure slot {} failed, rolling back", slot); ++ let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(EBADFD))?; ++ let mut input_context = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; ++ Self::restore_input_context(&mut *input_context, &input_snapshot); ++ return Err(err); ++ } + +- let (event_trb, command_trb) = self +- .execute_command(|trb, cycle| { +- trb.configure_endpoint(slot, input_context_physical, cycle) +- }) +- .await; ++ if let Err(err) = self.set_configuration(port, configuration_value).await { ++ warn!("xhcid: configure slot {} failed, rolling back", slot); ++ { ++ let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(EBADFD))?; ++ let mut input_context = port_state ++ .input_context ++ .lock() ++ .map_err(|_| Error::new(EIO))?; ++ Self::restore_input_context(&mut *input_context, &input_snapshot); ++ } + +- //self.event_handler_finished(); ++ if let Err(restore_err) = self.set_configuration(port, previous_cfg_idx.unwrap_or(0)).await { ++ warn!( ++ "xhcid: failed to restore configuration {} for slot {}: {}", ++ previous_cfg_idx.unwrap_or(0), ++ slot, ++ restore_err ++ ); ++ } + +- handle_event_trb("CONFIGURE_ENDPOINT", &event_trb, &command_trb)?; ++ return Err(err); + } + +- // Tell the device about this configuration. 
+- self.set_configuration(port, configuration_value).await?; ++ let mut port_state = self.port_states.get_mut(&port).ok_or(Error::new(EBADFD))?; ++ port_state.cfg_idx = Some(req.config_desc); ++ port_state.endpoint_states.retain(|endp_num, _| *endp_num == 0); ++ for (endp_num, state) in staged_endpoint_states { ++ port_state.endpoint_states.insert(endp_num, state); ++ } + + Ok(()) + } diff --git a/local/patches/kernel/P4-s3-suspend-resume.patch b/local/patches/kernel/P4-s3-suspend-resume.patch new file mode 100644 index 00000000..9b223b31 --- /dev/null +++ b/local/patches/kernel/P4-s3-suspend-resume.patch @@ -0,0 +1,1084 @@ +diff --git a/Cargo.toml b/Cargo.toml +index 6d4f059..e05f723 100644 +--- a/Cargo.toml ++++ b/Cargo.toml +@@ -12,6 +12,7 @@ cc = "1.0" + toml = "0.8" + + [dependencies] ++acpi_ext = { package = "acpi", git = "https://gitlab.redox-os.org/redox-os/acpi.git", branch = "redox-6.x" } + arrayvec = { version = "0.7.4", default-features = false } + bitfield = "0.13.2" + bitflags = "2" +diff --git a/build.rs b/build.rs +index 96c3ea5..751746c 100644 +--- a/build.rs ++++ b/build.rs +@@ -77,6 +77,7 @@ fn main() { + } + "x86_64" => { + println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm"); ++ println!("cargo::rerun-if-changed=src/asm/x86_64/s3_wakeup.asm"); + + let status = Command::new("nasm") + .arg("-f") +@@ -89,6 +90,18 @@ fn main() { + if !status.success() { + panic!("nasm failed with exit status {}", status); + } ++ ++ let status = Command::new("nasm") ++ .arg("-f") ++ .arg("bin") ++ .arg("-o") ++ .arg(format!("{}/s3_wakeup", out_dir)) ++ .arg("src/asm/x86_64/s3_wakeup.asm") ++ .status() ++ .expect("failed to run nasm"); ++ if !status.success() { ++ panic!("nasm failed with exit status {}", status); ++ } + } + "riscv64" => { + println!("cargo::rustc-cfg=dtb"); +diff --git a/src/acpi/mod.rs b/src/acpi/mod.rs +index 59e3526..b3b80f0 100644 +--- a/src/acpi/mod.rs ++++ b/src/acpi/mod.rs +@@ -82,6 +82,14 @@ impl Rxsdt for RxsdtEnum { + + pub static RXSDT_ENUM: Once = Once::new(); + ++#[derive(Clone, Copy, Debug)] ++pub struct AcpiRootInfo { ++ pub revision: u8, ++ pub root_sdt_address: PhysicalAddress, ++} ++ ++pub static ACPI_ROOT_INFO: Once = Once::new(); ++ + /// Parse the ACPI tables to gather CPU, interrupt, and timer information + pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) { + unsafe { +@@ -94,6 +102,15 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) { + let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp); + + if let Some(rsdp) = rsdp_opt { ++ let root_info = ACPI_ROOT_INFO.call_once(|| AcpiRootInfo { ++ revision: rsdp.revision(), ++ root_sdt_address: rsdp.sdt_address(), ++ }); ++ ++ if root_info.root_sdt_address != rsdp.sdt_address() || root_info.revision != rsdp.revision() { ++ error!("ACPI_ROOT_INFO already initialized with a different RSDP root"); ++ } ++ + debug!("SDT address: {:#x}", rsdp.sdt_address().data()); + let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw()); + +diff --git a/src/acpi/rsdp.rs b/src/acpi/rsdp.rs +index f10c5ac..5e93a9f 100644 +--- a/src/acpi/rsdp.rs ++++ b/src/acpi/rsdp.rs +@@ -31,4 +31,8 @@ impl Rsdp { + self.rsdt_address as usize + }) + } ++ ++ pub fn revision(&self) -> u8 { ++ self.revision ++ } + } +diff --git a/src/arch/x86_shared/mod.rs b/src/arch/x86_shared/mod.rs +index e3c3050..11c33e9 100644 +--- a/src/arch/x86_shared/mod.rs ++++ b/src/arch/x86_shared/mod.rs +@@ -28,6 +28,8 @@ pub mod pti; + /// Initialization and start function + pub mod start; + ++pub mod sleep; ++ + /// 
Stop function + pub mod stop; + +diff --git a/src/scheme/acpi.rs b/src/scheme/acpi.rs +index 87570a1..5d73469 100644 +--- a/src/scheme/acpi.rs ++++ b/src/scheme/acpi.rs +@@ -10,6 +10,7 @@ use syscall::{ + + use crate::{ + acpi::{RxsdtEnum, RXSDT_ENUM}, ++ arch::sleep, + context::file::InternalFlags, + event, + sync::{CleanLockToken, RwLock, WaitCondition, L1}, +@@ -40,6 +41,7 @@ enum HandleKind { + TopLevel, + Rxsdt, + ShutdownPipe, ++ SleepControl, + SchemeRoot, + } + +@@ -146,11 +148,11 @@ impl KernelScheme for AcpiScheme { + if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK { + return Err(Error::new(EINVAL)); + } +- if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { +- return Err(Error::new(EROFS)); +- } + let (handle_kind, int_flags) = match path { + "" => { ++ if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { ++ return Err(Error::new(EROFS)); ++ } + if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(EISDIR)); + } +@@ -158,17 +160,36 @@ impl KernelScheme for AcpiScheme { + (HandleKind::TopLevel, InternalFlags::POSITIONED) + } + "rxsdt" => { ++ if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { ++ return Err(Error::new(EROFS)); ++ } + if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(ENOTDIR)); + } + (HandleKind::Rxsdt, InternalFlags::POSITIONED) + } + "kstop" => { ++ if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { ++ return Err(Error::new(EROFS)); ++ } + if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(ENOTDIR)); + } + (HandleKind::ShutdownPipe, InternalFlags::empty()) + } ++ "sleep" => { ++ if flags & O_ACCMODE == O_RDONLY || flags & O_STAT == O_STAT { ++ // allowed ++ } else if flags & O_ACCMODE != syscall::flag::O_WRONLY ++ && flags & O_ACCMODE != syscall::flag::O_RDWR ++ { ++ return Err(Error::new(EINVAL)); ++ } ++ if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { ++ return Err(Error::new(ENOTDIR)); ++ } ++ (HandleKind::SleepControl, InternalFlags::POSITIONED) ++ } + _ => return Err(Error::new(ENOENT)), + }; + +@@ -191,6 +212,7 @@ impl KernelScheme for AcpiScheme { + Ok(match handle.kind { + HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64, + HandleKind::ShutdownPipe => 1, ++ HandleKind::SleepControl => sleep::available_sleep_states().len() as u64, + HandleKind::TopLevel => 0, + HandleKind::SchemeRoot => return Err(Error::new(EBADF))?, + }) +@@ -253,6 +275,7 @@ impl KernelScheme for AcpiScheme { + + return dst_buf.copy_exactly(&[0x42]).map(|()| 1); + } ++ HandleKind::SleepControl => sleep::available_sleep_states(), + HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?, + HandleKind::TopLevel => return Err(Error::new(EISDIR)), + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), +@@ -295,11 +318,45 @@ impl KernelScheme for AcpiScheme { + kind: DirentKind::Socket, + name: "kstop", + inode: 0, ++ next_opaque_id: 2, ++ })?; ++ } ++ if opaque <= 2 { ++ buf.entry(DirEntry { ++ kind: DirentKind::Regular, ++ name: "sleep", ++ inode: 0, + next_opaque_id: u64::MAX, + })?; + } + Ok(buf.finalize()) + } ++ fn kwrite( ++ &self, ++ id: usize, ++ buf: crate::syscall::usercopy::UserSliceRo, ++ _flags: u32, ++ _stored_flags: u32, ++ token: &mut CleanLockToken, ++ ) -> Result { ++ let handle = *HANDLES.read(token.token()).get(id)?; ++ ++ if handle.stat { ++ return Err(Error::new(EBADF)); ++ } ++ ++ match handle.kind { ++ HandleKind::SleepControl => { ++ let mut 
tmp = [0_u8; 16]; ++ let len = buf.copy_common_bytes_to_slice(&mut tmp)?; ++ let request = core::str::from_utf8(&tmp[..len]).map_err(|_| Error::new(EINVAL))?; ++ sleep::trigger_sleep_request(request)?; ++ Ok(len) ++ } ++ HandleKind::SchemeRoot => Err(Error::new(EBADF)), ++ _ => Err(Error::new(EBADF)), ++ } ++ } + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + //TODO: construct useful path? + buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes()) +@@ -328,6 +385,11 @@ impl KernelScheme for AcpiScheme { + st_size: 1, + ..Default::default() + }, ++ HandleKind::SleepControl => Stat { ++ st_mode: MODE_FILE, ++ st_size: sleep::available_sleep_states().len().try_into().unwrap_or(u64::MAX), ++ ..Default::default() ++ }, + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + })?; + +diff --git a/src/arch/x86_shared/sleep.rs b/src/arch/x86_shared/sleep.rs +new file mode 100644 +index 0000000..9f98c0d +--- /dev/null ++++ b/src/arch/x86_shared/sleep.rs +@@ -0,0 +1,712 @@ ++use alloc::{sync::Arc, vec::Vec}; ++use core::{ ++ ptr::NonNull, ++ str::FromStr, ++ sync::atomic::{AtomicU32, Ordering}, ++}; ++ ++use acpi_ext::{ ++ aml::{namespace::AmlName, object::Object, Interpreter}, ++ registers::FixedRegisters, ++ sdt::{facs::Facs, fadt::Fadt, SdtHeader}, ++ AcpiTables, Handle, Handler, PhysicalMapping, ++}; ++use spin::Mutex; ++use syscall::error::{Error, EINVAL, EIO}; ++use x86::{segmentation::SegmentSelector, task, Ring}; ++ ++use crate::{ ++ acpi::ACPI_ROOT_INFO, ++ arch::interrupt, ++ memory::{ ++ round_down_pages, round_up_pages, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, ++ RmmArch, VirtualAddress, PAGE_SIZE, ++ }, ++ syscall::io::{Io, Pio}, ++}; ++ ++const ACPI_SLP_TYP_SHIFT: u16 = 10; ++const ACPI_SLP_TYP_MASK: u16 = 0x1C00; ++const ACPI_SLP_EN: u16 = 1 << 13; ++const WAKE_TRAMPOLINE_PHYS: usize = 0x8000; ++const SLEEP_RETURN_OK: usize = 0; ++ ++#[cfg(target_arch = "x86_64")] ++static WAKE_TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/s3_wakeup")); ++ ++#[repr(C, packed)] ++#[derive(Clone, Copy, Debug, Default)] ++struct DescriptorTableRegister { ++ limit: u16, ++ base: u64, ++} ++ ++#[repr(C, align(64))] ++#[derive(Clone, Copy, Debug)] ++struct FpuState { ++ bytes: [u8; 4096], ++} ++ ++impl Default for FpuState { ++ fn default() -> Self { ++ Self { bytes: [0; 4096] } ++ } ++} ++ ++#[derive(Clone, Copy, Debug, Eq, PartialEq)] ++pub enum SleepState { ++ S3, ++ S5, ++} ++ ++#[derive(Clone, Copy, Debug, Eq, PartialEq)] ++pub enum SleepError { ++ UnsupportedArch, ++ MissingAcpi, ++ MissingFadt, ++ MissingFacs, ++ MissingSleepObject, ++ InvalidSleepObject, ++ UnsupportedPmControl, ++ UnsupportedAmlOperation, ++ SleepDidNotEnter, ++} ++ ++impl SleepError { ++ fn code(self) -> usize { ++ match self { ++ Self::UnsupportedArch => EINVAL as usize, ++ Self::MissingAcpi ++ | Self::MissingFadt ++ | Self::MissingFacs ++ | Self::MissingSleepObject ++ | Self::UnsupportedAmlOperation => EIO as usize, ++ Self::InvalidSleepObject | Self::UnsupportedPmControl | Self::SleepDidNotEnter => { ++ EINVAL as usize ++ } ++ } ++ } ++ ++ fn from_code(code: usize) -> Self { ++ match code as i32 { ++ x if x == EINVAL => Self::InvalidSleepObject, ++ _ => Self::MissingAcpi, ++ } ++ } ++} ++ ++#[derive(Clone, Copy, Debug, Default)] ++struct SavedCpuContext { ++ entry_rsp: usize, ++ runtime_rsp: usize, ++ facs_address: usize, ++ cr0: usize, ++ cr2: usize, ++ cr3: usize, ++ cr4: usize, ++ rflags: usize, ++ gdtr: DescriptorTableRegister, ++ 
idtr: DescriptorTableRegister, ++ efer: u64, ++ fs_base: u64, ++ gs_base: u64, ++ kernel_gs_base: u64, ++ fpu: FpuState, ++} ++ ++static SAVED_CONTEXT: Mutex> = Mutex::new(None); ++static AML_MUTEX_IDS: AtomicU32 = AtomicU32::new(1); ++ ++#[derive(Clone, Copy, Debug)] ++struct SleepTypeData { ++ a: u16, ++ b: u16, ++} ++ ++#[derive(Clone, Copy)] ++struct KernelAcpiHandler; ++ ++impl KernelAcpiHandler { ++ fn map_range(physical_address: usize, size: usize) -> (*mut u8, usize) { ++ let map_base = round_down_pages(physical_address); ++ let map_offset = physical_address - map_base; ++ let mapped_length = round_up_pages(size + map_offset); ++ ++ // SAFETY: The ACPI interpreter only requests firmware-described physical regions. ++ unsafe { ++ let mut mapper = KernelMapper::lock_rw(); ++ for page_index in 0..mapped_length / PAGE_SIZE { ++ let (_, flush) = mapper ++ .map_linearly( ++ PhysicalAddress::new(map_base + page_index * PAGE_SIZE), ++ PageFlags::new(), ++ ) ++ .expect("failed to linearly map ACPI physical region"); ++ flush.flush(); ++ } ++ } ++ ++ let virtual_base = RmmA::phys_to_virt(PhysicalAddress::new(map_base)).data(); ++ ((virtual_base + map_offset) as *mut u8, mapped_length) ++ } ++} ++ ++impl Handler for KernelAcpiHandler { ++ unsafe fn map_physical_region(&self, physical_address: usize, size: usize) -> PhysicalMapping { ++ let (virtual_start, mapped_length) = Self::map_range(physical_address, size); ++ PhysicalMapping { ++ physical_start: physical_address, ++ virtual_start: NonNull::new(virtual_start.cast::()) ++ .expect("expected mapped ACPI virtual address to be non-null"), ++ region_length: size, ++ mapped_length, ++ handler: *self, ++ } ++ } ++ ++ fn unmap_physical_region(_region: &PhysicalMapping) {} ++ ++ fn read_u8(&self, address: usize) -> u8 { ++ // SAFETY: AML system-memory accesses are byte-addressable firmware regions. ++ unsafe { core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u8) } ++ } ++ ++ fn read_u16(&self, address: usize) -> u16 { ++ // SAFETY: AML system-memory accesses are word-addressable firmware regions. ++ unsafe { ++ core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u16) ++ } ++ } ++ ++ fn read_u32(&self, address: usize) -> u32 { ++ // SAFETY: AML system-memory accesses are dword-addressable firmware regions. ++ unsafe { ++ core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u32) ++ } ++ } ++ ++ fn read_u64(&self, address: usize) -> u64 { ++ // SAFETY: AML system-memory accesses are qword-addressable firmware regions. ++ unsafe { ++ core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u64) ++ } ++ } ++ ++ fn write_u8(&self, address: usize, value: u8) { ++ // SAFETY: AML system-memory accesses are byte-addressable firmware regions. ++ unsafe { ++ core::ptr::write_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u8, value) ++ } ++ } ++ ++ fn write_u16(&self, address: usize, value: u16) { ++ // SAFETY: AML system-memory accesses are word-addressable firmware regions. ++ unsafe { ++ core::ptr::write_volatile( ++ RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u16, ++ value, ++ ) ++ } ++ } ++ ++ fn write_u32(&self, address: usize, value: u32) { ++ // SAFETY: AML system-memory accesses are dword-addressable firmware regions. 
++ unsafe { ++ core::ptr::write_volatile( ++ RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u32, ++ value, ++ ) ++ } ++ } ++ ++ fn write_u64(&self, address: usize, value: u64) { ++ // SAFETY: AML system-memory accesses are qword-addressable firmware regions. ++ unsafe { ++ core::ptr::write_volatile( ++ RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u64, ++ value, ++ ) ++ } ++ } ++ ++ fn read_io_u8(&self, port: u16) -> u8 { ++ Pio::::new(port).read() ++ } ++ ++ fn read_io_u16(&self, port: u16) -> u16 { ++ Pio::::new(port).read() ++ } ++ ++ fn read_io_u32(&self, port: u16) -> u32 { ++ Pio::::new(port).read() ++ } ++ ++ fn write_io_u8(&self, port: u16, value: u8) { ++ Pio::::new(port).write(value) ++ } ++ ++ fn write_io_u16(&self, port: u16, value: u16) { ++ Pio::::new(port).write(value) ++ } ++ ++ fn write_io_u32(&self, port: u16, value: u32) { ++ Pio::::new(port).write(value) ++ } ++ ++ fn read_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u8 { ++ 0 ++ } ++ ++ fn read_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u16 { ++ 0 ++ } ++ ++ fn read_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u32 { ++ 0 ++ } ++ ++ fn write_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u8) {} ++ ++ fn write_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u16) {} ++ ++ fn write_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u32) {} ++ ++ fn nanos_since_boot(&self) -> u64 { ++ 0 ++ } ++ ++ fn stall(&self, microseconds: u64) { ++ for _ in 0..(microseconds.saturating_mul(64)) { ++ core::hint::spin_loop(); ++ } ++ } ++ ++ fn sleep(&self, milliseconds: u64) { ++ for _ in 0..(milliseconds.saturating_mul(64_000)) { ++ core::hint::spin_loop(); ++ } ++ } ++ ++ fn create_mutex(&self) -> Handle { ++ Handle(AML_MUTEX_IDS.fetch_add(1, Ordering::Relaxed)) ++ } ++ ++ fn acquire(&self, _mutex: Handle, _timeout: u16) -> Result<(), acpi_ext::aml::AmlError> { ++ Ok(()) ++ } ++ ++ fn release(&self, _mutex: Handle) {} ++} ++ ++fn sleep_state_name(state: SleepState) -> &'static str { ++ match state { ++ SleepState::S3 => "\\_S3", ++ SleepState::S5 => "\\_S5", ++ } ++} ++ ++fn encode_sleep_type(value: u16) -> u16 { ++ if value <= 0x7 { ++ value << ACPI_SLP_TYP_SHIFT ++ } else { ++ value & ACPI_SLP_TYP_MASK ++ } ++} ++ ++fn load_interpreter() -> Result<( ++ Arc>, ++ PhysicalMapping, ++ Interpreter, ++), SleepError> { ++ let root = *ACPI_ROOT_INFO.get().ok_or(SleepError::MissingAcpi)?; ++ let handler = KernelAcpiHandler; ++ ++ // SAFETY: ACPI root info is captured from the firmware-provided, already validated root table. ++ let tables = unsafe { ++ AcpiTables::from_rsdt(handler, root.revision, root.root_sdt_address.data()) ++ .map_err(|_| SleepError::MissingAcpi)? ++ }; ++ let fadt = tables.find_table::().ok_or(SleepError::MissingFadt)?; ++ let registers = Arc::new( ++ FixedRegisters::new(&fadt, handler).map_err(|_| SleepError::UnsupportedPmControl)?, ++ ); ++ let facs_address = fadt.facs_address().map_err(|_| SleepError::MissingFacs)?; ++ ++ // SAFETY: The FADT-supplied FACS address is used exactly as described by the ACPI spec. ++ let facs = unsafe { handler.map_physical_region::(facs_address, core::mem::size_of::()) }; ++ // SAFETY: The AML interpreter only needs an owned mapping of the same firmware FACS table. 
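++    // Two mappings of the same FACS coexist: `facs` stays with the caller so
++    // the firmware waking vector can be armed, while `interpreter_facs` is
++    // owned by the AML interpreter.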
++ let interpreter_facs = unsafe { ++ handler.map_physical_region::(facs_address, core::mem::size_of::()) ++ }; ++ let dsdt = tables.dsdt().map_err(|_| SleepError::MissingFadt)?; ++ let interpreter = Interpreter::new(handler, dsdt.revision, Arc::clone(®isters), Some(interpreter_facs)); ++ ++ // SAFETY: Each AML table mapping is owned by the interpreter during table loading. ++ unsafe { ++ let mapping = handler.map_physical_region::(dsdt.phys_address, dsdt.length as usize); ++ let stream = core::slice::from_raw_parts( ++ mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::()) as *const u8, ++ dsdt.length as usize - core::mem::size_of::(), ++ ); ++ interpreter ++ .load_table(stream) ++ .map_err(|_| SleepError::UnsupportedAmlOperation)?; ++ ++ for ssdt in tables.ssdts() { ++ let mapping = handler.map_physical_region::(ssdt.phys_address, ssdt.length as usize); ++ let stream = core::slice::from_raw_parts( ++ mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::()) as *const u8, ++ ssdt.length as usize - core::mem::size_of::(), ++ ); ++ interpreter ++ .load_table(stream) ++ .map_err(|_| SleepError::UnsupportedAmlOperation)?; ++ } ++ } ++ ++ Ok((registers, facs, interpreter)) ++} ++ ++fn sleep_type_data_from_interpreter( ++ interpreter: &Interpreter, ++ state: SleepState, ++) -> Result { ++ let name = AmlName::from_str(sleep_state_name(state)).map_err(|_| SleepError::MissingSleepObject)?; ++ let object = interpreter ++ .evaluate(name, Vec::new()) ++ .map_err(|_| SleepError::MissingSleepObject)?; ++ ++ let Object::Package(package) = &*object else { ++ return Err(SleepError::InvalidSleepObject); ++ }; ++ ++ let Some(typa_object) = package.first() else { ++ return Err(SleepError::InvalidSleepObject); ++ }; ++ let Some(typb_object) = package.get(1) else { ++ return Err(SleepError::InvalidSleepObject); ++ }; ++ ++ let Object::Integer(typa) = &**typa_object else { ++ return Err(SleepError::InvalidSleepObject); ++ }; ++ let Object::Integer(typb) = &**typb_object else { ++ return Err(SleepError::InvalidSleepObject); ++ }; ++ ++ Ok(SleepTypeData { ++ a: encode_sleep_type(*typa as u16), ++ b: encode_sleep_type(*typb as u16), ++ }) ++} ++ ++fn sleep_type_data(state: SleepState) -> Result { ++ let (_registers, _facs, interpreter) = load_interpreter()?; ++ sleep_type_data_from_interpreter(&interpreter, state) ++} ++ ++fn install_wake_trampoline(stack_rsp: usize, cr3: usize) { ++ let trampoline_page = Page::containing_address(VirtualAddress::new(WAKE_TRAMPOLINE_PHYS)); ++ let trampoline_frame = PhysicalAddress::new(WAKE_TRAMPOLINE_PHYS); ++ ++ // SAFETY: The 0x8000 low-memory trampoline page is reserved by the kernel for bootstrap stubs. ++ let (result, _) = unsafe { ++ let mut mapper = KernelMapper::lock_rw(); ++ let result = mapper ++ .map_phys( ++ trampoline_page.start_address(), ++ trampoline_frame, ++ PageFlags::new().execute(true).write(true), ++ ) ++ .expect("failed to map S3 wake trampoline page"); ++ (result, mapper.table().phys().data()) ++ }; ++ result.flush(); ++ ++ for (index, value) in WAKE_TRAMPOLINE_DATA.iter().enumerate() { ++ // SAFETY: The trampoline page is mapped writable at the same virtual address as the physical page. ++ unsafe { ++ core::ptr::write_volatile((WAKE_TRAMPOLINE_PHYS as *mut u8).add(index), *value); ++ } ++ } ++ ++ // SAFETY: The wake trampoline layout reserves three qword fields immediately after the jump. 
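++    // This layout must match s3_wakeup.asm: a short jump padded to 8 bytes,
++    // followed by the `.stack`, `.page_table` and `.code` qword fields.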
++ unsafe { ++ let stack_slot = (WAKE_TRAMPOLINE_PHYS + 8) as *mut u64; ++ let page_table_slot = stack_slot.add(1); ++ let code_slot = stack_slot.add(2); ++ stack_slot.write(stack_rsp as u64); ++ page_table_slot.write(cr3 as u64); ++ #[expect(clippy::fn_to_numeric_cast)] ++ code_slot.write(resume_from_s3_trampoline as usize as u64); ++ } ++ ++ // SAFETY: The trampoline mapping is no longer needed once the physical page has been populated. ++ let (_frame, _, flush) = unsafe { ++ KernelMapper::lock_rw() ++ .unmap_phys(trampoline_page.start_address()) ++ .expect("failed to unmap S3 wake trampoline page") ++ }; ++ flush.flush(); ++} ++ ++fn save_descriptor_tables(context: &mut SavedCpuContext) { ++ // SAFETY: SGDT/SIDT only read the current CPU descriptor-table registers into the provided storage. ++ unsafe { ++ core::arch::asm!("sgdt [{}]", in(reg) &mut context.gdtr, options(nostack, preserves_flags)); ++ core::arch::asm!("sidt [{}]", in(reg) &mut context.idtr, options(nostack, preserves_flags)); ++ } ++} ++ ++fn save_fpu_state(context: &mut SavedCpuContext) { ++ // SAFETY: The kernel owns the current CPU at suspend entry and the FXSAVE buffer is 64-byte aligned. ++ unsafe { ++ core::arch::asm!( ++ "fxsave64 [{}]", ++ in(reg) context.fpu.bytes.as_mut_ptr(), ++ ); ++ } ++} ++ ++fn restore_fpu_state(context: &SavedCpuContext) { ++ // SAFETY: The saved FXSAVE image belongs to the same CPU context and matches the restore instruction. ++ unsafe { ++ core::arch::asm!( ++ "fxrstor64 [{}]", ++ in(reg) context.fpu.bytes.as_ptr(), ++ ); ++ } ++} ++ ++fn save_cpu_context(entry_rsp: usize) -> SavedCpuContext { ++ let mut context = SavedCpuContext { ++ entry_rsp, ++ ..SavedCpuContext::default() ++ }; ++ ++ // SAFETY: Reading control registers and MSRs is required to reconstruct the CPU execution state on wake. ++ unsafe { ++ core::arch::asm!( ++ "mov {}, cr0", ++ out(reg) context.cr0, ++ options(nostack, preserves_flags) ++ ); ++ core::arch::asm!( ++ "mov {}, cr2", ++ out(reg) context.cr2, ++ options(nostack, preserves_flags) ++ ); ++ core::arch::asm!( ++ "mov {}, cr3", ++ out(reg) context.cr3, ++ options(nostack, preserves_flags) ++ ); ++ core::arch::asm!( ++ "mov {}, cr4", ++ out(reg) context.cr4, ++ options(nostack, preserves_flags) ++ ); ++ core::arch::asm!( ++ "pushfq", ++ "pop {}", ++ out(reg) context.rflags, ++ options(preserves_flags) ++ ); ++ core::arch::asm!("mov {}, rsp", out(reg) context.runtime_rsp, options(nostack, preserves_flags)); ++ ++ context.efer = x86::msr::rdmsr(x86::msr::IA32_EFER); ++ context.fs_base = x86::msr::rdmsr(x86::msr::IA32_FS_BASE); ++ context.gs_base = x86::msr::rdmsr(x86::msr::IA32_GS_BASE); ++ context.kernel_gs_base = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE); ++ } ++ ++ save_descriptor_tables(&mut context); ++ save_fpu_state(&mut context); ++ context ++} ++ ++fn set_firmware_waking_vector(facs: &mut PhysicalMapping, vector: usize) { ++ facs.firmware_waking_vector = vector as u32; ++ facs.x_firmware_waking_vector = vector as u64; ++} ++ ++fn write_pm1_control_block( ++ registers: &FixedRegisters, ++ sleep_type: SleepTypeData, ++) -> Result<(), SleepError> { ++ let current_a = registers ++ .pm1_control_registers ++ .pm1a ++ .read() ++ .map_err(|_| SleepError::UnsupportedPmControl)? 
as u16; ++ let armed_a = (current_a & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.a; ++ ++ registers ++ .pm1_control_registers ++ .pm1a ++ .write(u64::from(armed_a)) ++ .map_err(|_| SleepError::UnsupportedPmControl)?; ++ ++ if let Some(pm1b) = ®isters.pm1_control_registers.pm1b { ++ let current_b = pm1b.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16; ++ let armed_b = (current_b & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.b; ++ pm1b.write(u64::from(armed_b)) ++ .map_err(|_| SleepError::UnsupportedPmControl)?; ++ pm1b.write(u64::from(armed_b | ACPI_SLP_EN)) ++ .map_err(|_| SleepError::UnsupportedPmControl)?; ++ } ++ ++ // SAFETY: WBINVD is required here to flush dirty cache lines before firmware powers down the CPU package. ++ unsafe { ++ core::arch::asm!("wbinvd", options(nostack, preserves_flags)); ++ } ++ ++ registers ++ .pm1_control_registers ++ .pm1a ++ .write(u64::from(armed_a | ACPI_SLP_EN)) ++ .map_err(|_| SleepError::UnsupportedPmControl)?; ++ ++ Ok(()) ++} ++ ++#[unsafe(naked)] ++unsafe extern "sysv64" fn enter_sleep_raw(state: usize) -> usize { ++ core::arch::naked_asm!( ++ "mov rsi, rsp", ++ "jmp {inner}", ++ inner = sym enter_sleep_raw_inner, ++ ); ++} ++ ++extern "C" fn enter_sleep_raw_inner(state: usize, entry_rsp: usize) -> usize { ++ let state = match state { ++ 3 => SleepState::S3, ++ 5 => SleepState::S5, ++ _ => return SleepError::InvalidSleepObject.code(), ++ }; ++ ++ let (registers, mut facs, interpreter) = match load_interpreter() { ++ Ok(tuple) => tuple, ++ Err(error) => return error.code(), ++ }; ++ let sleep_type = match sleep_type_data_from_interpreter(&interpreter, state) { ++ Ok(data) => data, ++ Err(error) => return error.code(), ++ }; ++ ++ let mut context = save_cpu_context(entry_rsp); ++ context.facs_address = facs.physical_start; ++ install_wake_trampoline(context.runtime_rsp, context.cr3); ++ set_firmware_waking_vector(&mut facs, WAKE_TRAMPOLINE_PHYS); ++ ++ { ++ let mut saved = SAVED_CONTEXT.lock(); ++ *saved = Some(context); ++ } ++ ++ // SAFETY: Suspend entry must not be interrupted while the wake vector and PM1 control block are being armed. ++ unsafe { ++ interrupt::disable(); ++ } ++ ++ if let Err(error) = write_pm1_control_block(registers.as_ref(), sleep_type) { ++ return error.code(); ++ } ++ ++ // SAFETY: The final CLI+HLT sequence is the architectural handoff point after asserting SLP_EN. ++ unsafe { ++ core::arch::asm!("cli; hlt", options(nostack)); ++ } ++ ++ SleepError::SleepDidNotEnter.code() ++} ++ ++extern "C" fn resume_from_s3_trampoline() -> ! { ++ let mut saved = SAVED_CONTEXT.lock(); ++ let context = saved.take().expect("S3 wake trampoline resumed without saved CPU context"); ++ drop(saved); ++ ++ // SAFETY: The saved FACS physical address was captured from the validated FADT during suspend entry. ++ if context.facs_address != 0 { ++ let mut facs = unsafe { ++ KernelAcpiHandler.map_physical_region::( ++ context.facs_address, ++ core::mem::size_of::(), ++ ) ++ }; ++ set_firmware_waking_vector(&mut facs, 0); ++ } ++ ++ // SAFETY: The wake trampoline already switched to the saved kernel CR3 and long mode, so the remaining restores are architectural register state only. 
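++    // Restore order: EFER and the control registers first, then the
++    // descriptor tables and TSS, and finally the FS/GS base MSRs.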
++ unsafe { ++ x86::msr::wrmsr(x86::msr::IA32_EFER, context.efer); ++ core::arch::asm!("mov cr3, {}", in(reg) context.cr3, options(nostack)); ++ core::arch::asm!("mov cr4, {}", in(reg) context.cr4, options(nostack)); ++ core::arch::asm!("mov cr2, {}", in(reg) context.cr2, options(nostack)); ++ core::arch::asm!("mov cr0, {}", in(reg) context.cr0, options(nostack)); ++ core::arch::asm!("lgdt [{}]", in(reg) &context.gdtr, options(nostack)); ++ core::arch::asm!("lidt [{}]", in(reg) &context.idtr, options(nostack)); ++ ++ task::load_tr(SegmentSelector::new(crate::arch::gdt::GDT_TSS as u16, Ring::Ring0)); ++ ++ x86::msr::wrmsr(x86::msr::IA32_FS_BASE, context.fs_base); ++ x86::msr::wrmsr(x86::msr::IA32_GS_BASE, context.gs_base); ++ x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, context.kernel_gs_base); ++ } ++ ++ restore_fpu_state(&context); ++ ++ // SAFETY: Returning with the original entry stack and RFLAGS completes the suspend call as a successful function return. ++ unsafe { ++ core::arch::asm!( ++ "mov rsp, {entry_rsp}", ++ "push {rflags}", ++ "popfq", ++ "xor eax, eax", ++ "ret", ++ entry_rsp = in(reg) context.entry_rsp, ++ rflags = in(reg) context.rflags, ++ options(noreturn) ++ ); ++ } ++} ++ ++pub fn enter_sleep_state(state: SleepState) -> core::result::Result<(), SleepError> { ++ #[cfg(not(target_arch = "x86_64"))] ++ { ++ let _ = state; ++ return Err(SleepError::UnsupportedArch); ++ } ++ ++ #[cfg(target_arch = "x86_64")] ++ { ++ let raw = unsafe { ++ enter_sleep_raw(match state { ++ SleepState::S3 => 3, ++ SleepState::S5 => 5, ++ }) ++ }; ++ if raw == SLEEP_RETURN_OK { ++ Ok(()) ++ } else { ++ Err(SleepError::from_code(raw)) ++ } ++ } ++} ++ ++pub fn available_sleep_states() -> &'static [u8] { ++ if sleep_type_data(SleepState::S3).is_ok() { ++ b"S3\nS5\n" ++ } else { ++ b"S5\n" ++ } ++} ++ ++pub fn trigger_sleep_request(request: &str) -> Result<(), Error> { ++ match request.trim() { ++ "S3" => enter_sleep_state(SleepState::S3).map_err(|_| Error::new(EIO)), ++ "S5" => enter_sleep_state(SleepState::S5).map_err(|_| Error::new(EIO)), ++ _ => Err(Error::new(EINVAL)), ++ } ++} +diff --git a/src/asm/x86_64/s3_wakeup.asm b/src/asm/x86_64/s3_wakeup.asm +new file mode 100644 +index 0000000..7beeccf +--- /dev/null ++++ b/src/asm/x86_64/s3_wakeup.asm +@@ -0,0 +1,110 @@ ++; ACPI S3 wake trampoline ++; compiled with nasm by build.rs, copied to physical 0x8000 before S3 entry ++ ++ORG 0x8000 ++SECTION .text ++USE16 ++ ++trampoline: ++ jmp short startup_wake ++ times 8 - ($ - trampoline) nop ++ .stack: dq 0 ++ .page_table: dq 0 ++ .code: dq 0 ++ ++startup_wake: ++ cli ++ ++ xor ax, ax ++ mov ds, ax ++ mov es, ax ++ mov ss, ax ++ mov sp, 0 ++ ++ mov edi, [trampoline.page_table] ++ mov cr3, edi ++ ++ mov eax, cr0 ++ and al, 11110011b ++ or al, 00100010b ++ mov cr0, eax ++ ++ mov eax, cr4 ++ or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4 ++ mov cr4, eax ++ ++ fninit ++ ++ lgdt [gdtr] ++ ++ mov ecx, 0xC0000080 ++ rdmsr ++ or eax, 1 << 11 | 1 << 8 ++ wrmsr ++ ++ mov ebx, cr0 ++ or ebx, 1 << 31 | 1 << 16 | 1 ++ mov cr0, ebx ++ ++ jmp gdt.kernel_code:long_mode_wake ++ ++USE64 ++long_mode_wake: ++ mov rax, gdt.kernel_data ++ mov ds, rax ++ mov es, rax ++ mov fs, rax ++ mov gs, rax ++ mov ss, rax ++ ++ mov rsp, [trampoline.stack] ++ mov rax, [trampoline.code] ++ jmp rax ++ ++struc GDTEntry ++ .limitl resw 1 ++ .basel resw 1 ++ .basem resb 1 ++ .attribute resb 1 ++ .flags__limith resb 1 ++ .baseh resb 1 ++endstruc ++ ++attrib: ++ .present equ 1 << 7 ++ .user equ 1 << 4 ++ .code equ 1 << 3 ++ .writable equ 1 << 1 ++ 
++flags: ++ .long_mode equ 1 << 5 ++ ++gdtr: ++ dw gdt.end + 1 ++ dq gdt ++ ++gdt: ++.null equ $ - gdt ++ dq 0 ++ ++.kernel_code equ $ - gdt ++istruc GDTEntry ++ at GDTEntry.limitl, dw 0 ++ at GDTEntry.basel, dw 0 ++ at GDTEntry.basem, db 0 ++ at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code ++ at GDTEntry.flags__limith, db flags.long_mode ++ at GDTEntry.baseh, db 0 ++iend ++ ++.kernel_data equ $ - gdt ++istruc GDTEntry ++ at GDTEntry.limitl, dw 0 ++ at GDTEntry.basel, dw 0 ++ at GDTEntry.basem, db 0 ++ at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable ++ at GDTEntry.flags__limith, db 0 ++ at GDTEntry.baseh, db 0 ++iend ++ ++.end equ $ - gdt diff --git a/local/patches/kernel/P5-context-mod-sched.patch b/local/patches/kernel/P5-context-mod-sched.patch new file mode 100644 index 00000000..58a60a58 --- /dev/null +++ b/local/patches/kernel/P5-context-mod-sched.patch @@ -0,0 +1,13 @@ +diff --git a/src/context/mod.rs b/src/context/mod.rs +index 37c73f5..4f5d60f 100644 +--- a/src/context/mod.rs ++++ b/src/context/mod.rs +@@ -22,7 +22,7 @@ use crate::{ + + use self::context::Kstack; + pub use self::{ +- context::{BorrowedHtBuf, Context, Status}, ++ context::{BorrowedHtBuf, Context, SchedPolicy, Status}, + switch::switch, + }; + diff --git a/local/patches/kernel/P5-proc-setschedpolicy.patch b/local/patches/kernel/P5-proc-setschedpolicy.patch new file mode 100644 index 00000000..07e234a1 --- /dev/null +++ b/local/patches/kernel/P5-proc-setschedpolicy.patch @@ -0,0 +1,152 @@ +diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs +index 47588e1..6578761 100644 +--- a/src/scheme/proc.rs ++++ b/src/scheme/proc.rs +@@ -1,7 +1,7 @@ + use crate::{ + context::{ + self, +- context::{HardBlockedReason, LockedFdTbl, SignalState}, ++ context::{HardBlockedReason, LockedFdTbl, SchedPolicy, SignalState}, + file::InternalFlags, + memory::{handle_notify_files, AddrSpace, AddrSpaceWrapper, Grant, PageSpan}, + Context, ContextLock, Status, +@@ -105,6 +105,7 @@ enum ContextHandle { + // Attr handles, to set ens/euid/egid/pid. + Authority, + Attr, ++ Groups, + + Status { + privileged: bool, +@@ -145,6 +146,7 @@ enum ContextHandle { + // directory. + OpenViaDup, + SchedAffinity, ++ SchedPolicy, + + MmapMinAddr(Arc), + } +@@ -249,6 +251,9 @@ impl ProcScheme { + false, + ), + "sched-affinity" => (ContextHandle::SchedAffinity, true), ++ // TODO: Switch this kernel-local proc handle over to a stable upstream ++ // redox_syscall ProcCall::SetSchedPolicy opcode once that lands. 
++ "sched-policy" => (ContextHandle::SchedPolicy, false), + "status" => (ContextHandle::Status { privileged: false }, false), + _ if path.starts_with("auth-") => { + let nonprefix = &path["auth-".len()..]; +@@ -261,6 +266,7 @@ impl ProcScheme { + let handle = match actual_name { + "attrs" => ContextHandle::Attr, + "status" => ContextHandle::Status { privileged: true }, ++ "groups" => ContextHandle::Groups, + _ => return Err(Error::new(ENOENT)), + }; + +@@ -306,6 +312,11 @@ impl ProcScheme { + let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed)) + .ok_or(Error::new(EMFILE))?; + let context = context::spawn(true, Some(id), ret, token)?; ++ { ++ let parent_groups = ++ context::current().read(token.token()).groups.clone(); ++ context.write(token.token()).groups = parent_groups; ++ } + HANDLES.write(token.token()).insert( + id.get(), + Handle { +@@ -1165,6 +1176,20 @@ impl ContextHandle { + + Ok(size_of_val(&mask)) + } ++ Self::SchedPolicy => { ++ if buf.len() != 2 { ++ return Err(Error::new(EINVAL)); ++ } ++ ++ let [policy, rt_priority] = unsafe { buf.read_exact::<[u8; 2]>()? }; ++ let sched_policy = SchedPolicy::try_from_raw(policy).ok_or(Error::new(EINVAL))?; ++ ++ context ++ .write(token.token()) ++ .set_sched_policy(sched_policy, rt_priority); ++ ++ Ok(2) ++ } + ContextHandle::Status { privileged } => { + let mut args = buf.usizes(); + +@@ -1268,9 +1293,42 @@ impl ContextHandle { + guard.pid = info.pid as usize; + guard.euid = info.euid; + guard.egid = info.egid; +- guard.prio = (info.prio as usize).min(39); ++ guard.set_sched_other_prio(info.prio as usize); + Ok(size_of::()) + } ++ Self::Groups => { ++ const NGROUPS_MAX: usize = 65536; ++ if buf.len() % size_of::() != 0 { ++ return Err(Error::new(EINVAL)); ++ } ++ let count = buf.len() / size_of::(); ++ if count > NGROUPS_MAX { ++ return Err(Error::new(EINVAL)); ++ } ++ let mut groups = Vec::with_capacity(count); ++ for chunk in buf.in_exact_chunks(size_of::()).take(count) { ++ groups.push(chunk.read_u32()?); ++ } ++ let proc_id = { ++ let guard = context.read(token.token()); ++ guard.owner_proc_id ++ }; ++ { ++ let mut guard = context.write(token.token()); ++ guard.groups = groups.clone(); ++ } ++ if let Some(pid) = proc_id { ++ let mut contexts = context::contexts(token.downgrade()); ++ let (contexts, mut t) = contexts.token_split(); ++ for context_ref in contexts.iter() { ++ let mut ctx = context_ref.write(t.token()); ++ if ctx.owner_proc_id == Some(pid) { ++ ctx.groups = groups.clone(); ++ } ++ } ++ } ++ Ok(count * size_of::()) ++ } + ContextHandle::OpenViaDup => { + let mut args = buf.usizes(); + +@@ -1427,6 +1485,11 @@ impl ContextHandle { + + buf.copy_exactly(crate::cpu_set::mask_as_bytes(&mask))?; + Ok(size_of_val(&mask)) ++ } ++ ContextHandle::SchedPolicy => { ++ let context = context.read(token.token()); ++ let data = [context.sched_policy as u8, context.sched_rt_priority]; ++ buf.copy_common_bytes_from_slice(&data) + } // TODO: Replace write() with SYS_SENDFD? + ContextHandle::Status { .. 
} => { + let status = { +@@ -1475,6 +1538,15 @@ impl ContextHandle { + debug_name, + }) + } ++ Self::Groups => { ++ let c = &context.read(token.token()); ++ let max = buf.len() / size_of::(); ++ let count = c.groups.len().min(max); ++ for (chunk, gid) in buf.in_exact_chunks(size_of::()).zip(&c.groups).take(count) { ++ chunk.copy_from_slice(&gid.to_ne_bytes())?; ++ } ++ Ok(count * size_of::()) ++ } + ContextHandle::Sighandler => { + let data = match context.read(token.token()).sig { + Some(ref sig) => SetSighandlerData { diff --git a/local/patches/kernel/P5-sched-policy-context.patch b/local/patches/kernel/P5-sched-policy-context.patch new file mode 100644 index 00000000..067565ac --- /dev/null +++ b/local/patches/kernel/P5-sched-policy-context.patch @@ -0,0 +1,176 @@ +diff --git a/src/context/context.rs b/src/context/context.rs +index c97c516..8a8b078 100644 +--- a/src/context/context.rs ++++ b/src/context/context.rs +@@ -18,7 +18,8 @@ use crate::{ + cpu_stats, + ipi::{ipi, IpiKind, IpiTarget}, + memory::{ +- allocate_p2frame, deallocate_p2frame, Enomem, Frame, RaiiFrame, RmmA, RmmArch, PAGE_SIZE, ++ allocate_p2frame, deallocate_p2frame, Enomem, Frame, PhysicalAddress, RaiiFrame, RmmA, ++ RmmArch, PAGE_SIZE, + }, + percpu::PercpuBlock, + scheme::{CallerCtx, FileHandle, SchemeId}, +@@ -62,6 +63,38 @@ impl Status { + } + } + ++pub const SCHED_PRIORITY_LEVELS: usize = 40; ++pub const DEFAULT_SCHED_OTHER_PRIORITY: usize = 20; ++pub const DEFAULT_SCHED_RR_QUANTUM: u128 = 100_000_000; ++ ++#[repr(u8)] ++#[derive(Clone, Copy, Debug, PartialEq, Eq)] ++pub enum SchedPolicy { ++ Fifo = 0, ++ RoundRobin = 1, ++ Other = 2, ++} ++ ++impl SchedPolicy { ++ pub fn try_from_raw(raw: u8) -> Option { ++ match raw { ++ 0 => Some(Self::Fifo), ++ 1 => Some(Self::RoundRobin), ++ 2 => Some(Self::Other), ++ _ => None, ++ } ++ } ++} ++ ++pub fn rt_priority_to_kernel_prio(rt_priority: u8) -> usize { ++ (SCHED_PRIORITY_LEVELS - 1) ++ .saturating_sub((usize::from(rt_priority.min(99)) * (SCHED_PRIORITY_LEVELS - 1)) / 99) ++} ++ ++fn clamp_sched_other_prio(prio: usize) -> usize { ++ prio.min(SCHED_PRIORITY_LEVELS - 1) ++} ++ + #[derive(Clone, Debug)] + pub enum HardBlockedReason { + /// "SIGSTOP", only procmgr is allowed to switch contexts this state +@@ -140,6 +173,17 @@ pub struct Context { + pub fmap_ret: Option, + /// Priority + pub prio: usize, ++ pub sched_policy: SchedPolicy, ++ pub sched_rt_priority: u8, ++ pub sched_rr_ticks_consumed: u32, ++ pub sched_static_prio: usize, ++ pub sched_rr_quantum: u128, ++ #[allow(dead_code)] ++ pub futex_pi_boost: bool, ++ #[allow(dead_code)] ++ pub futex_pi_original_prio: usize, ++ #[allow(dead_code)] ++ pub futex_pi_waiters: Vec, + + // TODO: id can reappear after wraparound? + pub owner_proc_id: Option, +@@ -148,6 +192,8 @@ pub struct Context { + pub euid: u32, + pub egid: u32, + pub pid: usize, ++ /// Supplementary group IDs for access control decisions. 
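++ /// Copied from the parent context on spawn and exposed to schemes via
++ /// `CallerCtx::groups`; writable through the privileged `groups` proc handle.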
++ pub groups: Vec, + + // See [`PreemptGuard`] + // +@@ -197,13 +243,22 @@ impl Context { + files: Arc::new(RwLock::new(FdTbl::new())), + userspace: false, + fmap_ret: None, +- prio: 20, ++ prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ sched_policy: SchedPolicy::Other, ++ sched_rt_priority: 0, ++ sched_rr_ticks_consumed: 0, ++ sched_static_prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ sched_rr_quantum: DEFAULT_SCHED_RR_QUANTUM, ++ futex_pi_boost: false, ++ futex_pi_original_prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ futex_pi_waiters: Vec::new(), + being_sigkilled: false, + owner_proc_id, + + euid: 0, + egid: 0, + pid: 0, ++ groups: Vec::new(), + + #[cfg(feature = "syscall_debug")] + syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(), +@@ -218,11 +273,47 @@ impl Context { + self.preempt_locks == 0 + } + ++ fn base_sched_prio(&self) -> usize { ++ match self.sched_policy { ++ SchedPolicy::Other => clamp_sched_other_prio(self.sched_static_prio), ++ SchedPolicy::Fifo | SchedPolicy::RoundRobin => { ++ rt_priority_to_kernel_prio(self.sched_rt_priority) ++ } ++ } ++ } ++ ++ fn apply_sched_prio(&mut self) { ++ let base_prio = self.base_sched_prio(); ++ if self.futex_pi_boost { ++ self.futex_pi_original_prio = base_prio; ++ self.prio = self.prio.min(base_prio); ++ } else { ++ self.futex_pi_original_prio = base_prio; ++ self.prio = base_prio; ++ } ++ } ++ ++ pub fn set_sched_other_prio(&mut self, prio: usize) { ++ self.sched_static_prio = clamp_sched_other_prio(prio); ++ self.apply_sched_prio(); ++ } ++ ++ pub fn set_sched_policy(&mut self, sched_policy: SchedPolicy, rt_priority: u8) { ++ self.sched_policy = sched_policy; ++ self.sched_rt_priority = match sched_policy { ++ SchedPolicy::Other => 0, ++ SchedPolicy::Fifo | SchedPolicy::RoundRobin => rt_priority.min(99), ++ }; ++ self.sched_rr_ticks_consumed = 0; ++ self.apply_sched_prio(); ++ } ++ + /// Block the context, and return true if it was runnable before being blocked + pub fn block(&mut self, reason: &'static str) -> bool { + if self.status.is_runnable() { + self.status = Status::Blocked; + self.status_reason = reason; ++ self.sched_rr_ticks_consumed = 0; + true + } else { + false +@@ -232,6 +323,7 @@ impl Context { + pub fn hard_block(&mut self, reason: HardBlockedReason) -> bool { + if self.status.is_runnable() { + self.status = Status::HardBlocked { reason }; ++ self.sched_rr_ticks_consumed = 0; + + true + } else { +@@ -261,6 +353,7 @@ impl Context { + if self.status.is_soft_blocked() { + self.status = Status::Runnable; + self.status_reason = ""; ++ self.sched_rr_ticks_consumed = 0; + + true + } else { +@@ -479,6 +572,7 @@ impl Context { + uid: self.euid, + gid: self.egid, + pid: self.pid, ++ groups: self.groups.clone(), + } + } + } diff --git a/local/patches/kernel/P5-sched-rt-policy.patch b/local/patches/kernel/P5-sched-rt-policy.patch new file mode 100644 index 00000000..8d491afa --- /dev/null +++ b/local/patches/kernel/P5-sched-rt-policy.patch @@ -0,0 +1,150 @@ +diff --git a/src/context/switch.rs b/src/context/switch.rs +index 86684c8..aeb29c9 100644 +--- a/src/context/switch.rs ++++ b/src/context/switch.rs +@@ -5,7 +5,7 @@ + use crate::{ + context::{ + self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard, +- Context, ContextLock, WeakContextRef, ++ Context, ContextLock, SchedPolicy, WeakContextRef, + }, + cpu_set::LogicalCpuId, + cpu_stats::{self, CpuState}, +@@ -33,35 +33,17 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ + 70, 56, 45, 36, 29, 23, 18, 15, + ]; + +-/// Determines if a given context is 
eligible to be scheduled on a given CPU (in +-/// principle, the current CPU). +-/// +-/// # Safety +-/// This function is unsafe because it modifies the `context`'s state directly without synchronization. +-/// +-/// # Parameters +-/// - `context`: The context (process/thread) to be checked. +-/// - `cpu_id`: The logical ID of the CPU on which the context is being scheduled. +-/// +-/// # Returns +-/// - `UpdateResult::CanSwitch`: If the context can be switched to. +-/// - `UpdateResult::Skip`: If the context should be skipped (e.g., it's running on another CPU). + unsafe fn update_runnable( + context: &mut Context, + cpu_id: LogicalCpuId, + switch_time: u128, + ) -> UpdateResult { +- // Ignore contexts that are already running. + if context.running { + return UpdateResult::Skip; + } +- +- // Ignore contexts assigned to other CPUs. + if !context.sched_affinity.contains(cpu_id) { + return UpdateResult::Skip; + } +- +- // If context is soft-blocked and has a wake-up time, check if it should wake up. + if context.status.is_soft_blocked() + && let Some(wake) = context.wake + && switch_time >= wake +@@ -69,8 +51,6 @@ unsafe fn update_runnable( + context.wake = None; + context.unblock_no_ipi(); + } +- +- // If the context is runnable, indicate it can be switched to. + if context.status.is_runnable() { + UpdateResult::CanSwitch + } else { +@@ -95,7 +75,7 @@ pub fn tick(token: &mut CleanLockToken) { + let new_ticks = ticks_cell.get() + 1; + ticks_cell.set(new_ticks); + +- // Trigger a context switch after every 3 ticks (approx. 6.75 ms). ++ // Trigger a context switch after every 3 ticks. + if new_ticks >= 3 { + switch(token); + crate::context::signal::signal_handler(token); +@@ -167,10 +147,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + let mut prev_context_guard = unsafe { prev_context_lock.write_arc() }; + + if !prev_context_guard.is_preemptable() { +- // Unset global lock + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); +- +- // Pretend to have finished switching, so CPU is not idled + return SwitchResult::Switched; + } + +@@ -377,6 +354,71 @@ fn select_next_context( + let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); + let mut skipped_contexts = 0; + ++ // PASS 0: SCHED_FIFO and SCHED_RR — scan for RT contexts to schedule. ++ // When a runnable RT context is found, it takes priority over all SCHED_OTHER. 
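++ // rt_priority_to_kernel_prio() inverts the 0..=99 rt range onto queue
++ // indices [0, 39]: rt 99 -> queue 0 (scanned first), rt 50 -> queue 20,
++ // rt 0 -> queue 39, so a lower index means a more urgent RT context.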
++ for prio in 0..40 { ++ let rt_contexts = contexts_list ++ .get_mut(prio) ++ .expect("prio should be between [0, 39]"); ++ let len = rt_contexts.len(); ++ for _ in 0..len { ++ let (rt_ref, rt_lock) = match rt_contexts.pop_front() { ++ Some(lock) => match lock.upgrade() { ++ Some(l) => (lock, l), ++ None => { ++ skipped_contexts += 1; ++ continue; ++ } ++ }, ++ None => break, ++ }; ++ if Arc::ptr_eq(&rt_lock, &idle_context) { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ // Current RT thread: if runnable with no higher-prio RT found yet, ++ // keep it running (no demotion to SCHED_OTHER) ++ if Arc::ptr_eq(&rt_lock, &prev_context_lock) { ++ let mut rt_guard = unsafe { rt_lock.write_arc() }; ++ if rt_guard.status.is_runnable() ++ && (rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin) ++ { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ return Ok(Some(rt_guard)); ++ } ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ let mut rt_guard = unsafe { rt_lock.write_arc() }; ++ if !rt_guard.status.is_runnable() || rt_guard.running ++ || !rt_guard.sched_affinity.contains(cpu_id) ++ { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ if rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin ++ { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ if !Arc::ptr_eq(&prev_context_lock, &idle_context) { ++ let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); ++ if prev_context_guard.status.is_runnable() { ++ contexts_list[prev_context_guard.prio].push_back(prev_ctx); ++ } else { ++ idle_contexts(token.token()).push_back(prev_ctx); ++ } ++ } ++ return Ok(Some(rt_guard)); ++ } ++ rt_contexts.push_back(rt_ref); ++ } ++ } ++ ++ // PASS 1: SCHED_OTHER — existing DWRR deficit tracking ++ + 'priority: loop { + i = (i + 1) % 40; + total_iters += 1; diff --git a/local/patches/kernel/P5-scheme-sched-id.patch b/local/patches/kernel/P5-scheme-sched-id.patch new file mode 100644 index 00000000..5554697b --- /dev/null +++ b/local/patches/kernel/P5-scheme-sched-id.patch @@ -0,0 +1,20 @@ +diff --git a/src/scheme/mod.rs b/src/scheme/mod.rs +index d30272c..9da2b28 100644 +--- a/src/scheme/mod.rs ++++ b/src/scheme/mod.rs +@@ -777,6 +777,7 @@ pub struct CallerCtx { + pub pid: usize, + pub uid: u32, + pub gid: u32, ++ pub groups: alloc::vec::Vec, + } + impl CallerCtx { + pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self { +@@ -785,6 +786,7 @@ impl CallerCtx { + pid: self.pid, + uid: euid, + gid: egid, ++ groups: self.groups, + } + } else { + self diff --git a/local/patches/kernel/P6-futex-sharding.patch b/local/patches/kernel/P6-futex-sharding.patch new file mode 100644 index 00000000..d933e094 --- /dev/null +++ b/local/patches/kernel/P6-futex-sharding.patch @@ -0,0 +1,42 @@ +diff --git a/src/syscall/futex.rs b/src/syscall/futex.rs +index 4c187b8..9884d2b 100644 +--- a/src/syscall/futex.rs ++++ b/src/syscall/futex.rs +@@ -49,8 +49,13 @@ pub struct FutexEntry { + // implement that fully in userspace. Although futex is probably the best API for process-shared + // POSIX synchronization primitives, a local hash table and wait-for-thread kernel APIs (e.g. + // lwp_park/lwp_unpark from NetBSD) could be a simpler replacement. 
+-static FUTEXES: Mutex<FutexList> =
+- Mutex::new(FutexList::with_hasher(DefaultHashBuilder::new()));
++const FUTEX_SHARDS: usize = 64;
++
++fn futex_shard(phys: PhysicalAddress) -> usize {
++ (phys.data() as usize >> 12) % FUTEX_SHARDS
++}
++
++static FUTEXES: [Mutex<FutexList>; FUTEX_SHARDS] = [const { Mutex::new(FutexList::with_hasher(DefaultHashBuilder::new())) }; FUTEX_SHARDS];
+
+ fn validate_and_translate_virt(space: &AddrSpace, addr: VirtualAddress) -> Option<PhysicalAddress> {
+ // TODO: Move this elsewhere!
+@@ -97,7 +102,7 @@ pub fn futex(
+ {
+ // TODO: Lock ordering violation
+ let mut token = unsafe { CleanLockToken::new() };
+- let mut futexes = FUTEXES.lock(token.token());
++ let mut futexes = FUTEXES[futex_shard(target_physaddr)].lock(token.token());
+ let (futexes, mut token) = futexes.token_split();
+
+ let (fetched, expected) = if op == FUTEX_WAIT {
+@@ -181,10 +186,11 @@
+ }
+ FUTEX_WAKE => {
+ let mut woken = 0;
++ let shard = futex_shard(target_physaddr);
+
+ {
+ drop(addr_space_guard);
+- let mut futexes_map = FUTEXES.lock(token.token());
++ let mut futexes_map = FUTEXES[shard].lock(token.token());
+ let (futexes_map, mut token) = futexes_map.token_split();
+
+ let is_empty = if let Some(futexes) = futexes_map.get_mut(&target_physaddr) {
diff --git a/local/patches/kernel/P6-percpu-runqueues.patch b/local/patches/kernel/P6-percpu-runqueues.patch
new file mode 100644
index 00000000..1afe030b
--- /dev/null
+++ b/local/patches/kernel/P6-percpu-runqueues.patch
@@ -0,0 +1,89 @@
+diff --git a/src/percpu.rs b/src/percpu.rs
+index f4ad5e6..1844d62 100644
+--- a/src/percpu.rs
++++ b/src/percpu.rs
+@@ -1,4 +1,5 @@
+ use alloc::{
++ collections::VecDeque,
+ sync::{Arc, Weak},
+ vec::Vec,
+ };
+@@ -12,7 +13,10 @@ use syscall::PtraceFlags;
+
+ use crate::{
+ arch::device::ArchPercpuMisc,
+- context::{empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu},
++ context::{
++ empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu, WeakContextRef,
++ RUN_QUEUE_COUNT,
++ },
+ cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
+ cpu_stats::{CpuStats, CpuStatsData},
+ ptrace::Session,
+@@ -20,6 +24,42 @@ use crate::{
+ syscall::debug::SyscallDebugInfo,
+ };
+
++#[allow(dead_code)]
++pub struct PerCpuSched {
++ pub run_queues: [VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
++ pub run_queues_lock: AtomicBool,
++ pub balance: Cell<[usize; RUN_QUEUE_COUNT]>,
++ pub last_queue: Cell<usize>,
++}
++
++impl PerCpuSched {
++ pub const fn new() -> Self {
++ const EMPTY: VecDeque<WeakContextRef> = VecDeque::new();
++ Self {
++ run_queues: [EMPTY; RUN_QUEUE_COUNT],
++ run_queues_lock: AtomicBool::new(false),
++ balance: Cell::new([0; RUN_QUEUE_COUNT]),
++ last_queue: Cell::new(0),
++ }
++ }
++
++ pub fn take_lock(&self) {
++ while self
++ .run_queues_lock
++ .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
++ .is_err()
++ {
++ while self.run_queues_lock.load(Ordering::Relaxed) {
++ core::hint::spin_loop();
++ }
++ }
++ }
++
++ pub fn release_lock(&self) {
++ self.run_queues_lock.store(false, Ordering::Release);
++ }
++}
++
+ /// The percpu block, that stores all percpu variables.
+ pub struct PercpuBlock {
+ /// A unique immutable number that identifies the current CPU - used for scheduling
+@@ -31,7 +71,12 @@ pub struct PercpuBlock {
+ pub current_addrsp: RefCell<Option<Arc<AddrSpaceWrapper>>>,
+ pub new_addrsp_tmp: Cell<Option<Arc<AddrSpaceWrapper>>>,
+ pub wants_tlb_shootdown: AtomicBool,
+- pub balance: Cell<[usize; 40]>,
++
++ pub sched: PerCpuSched,
++
++ // Legacy DWRR state used by context/switch.rs until the per-CPU scheduler migration is
++ // finished.
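++ // (PerCpuSched above is gated behind #[allow(dead_code)] for now; the two
++ // fields below remain the ones select_next_context actually reads.)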
++ pub balance: Cell<[usize; RUN_QUEUE_COUNT]>,
+ pub last_queue: Cell<usize>,
+
+ // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
+@@ -187,7 +232,8 @@ impl PercpuBlock {
+ current_addrsp: RefCell::new(None),
+ new_addrsp_tmp: Cell::new(None),
+ wants_tlb_shootdown: AtomicBool::new(false),
+- balance: Cell::new([0; 40]),
++ sched: PerCpuSched::new(),
++ balance: Cell::new([0; RUN_QUEUE_COUNT]),
+ last_queue: Cell::new(39),
+ ptrace_flags: Cell::new(PtraceFlags::empty()),
+ ptrace_session: RefCell::new(None),
diff --git a/local/patches/kernel/P6-vruntime-context.patch b/local/patches/kernel/P6-vruntime-context.patch
new file mode 100644
index 00000000..f6050879
--- /dev/null
+++ b/local/patches/kernel/P6-vruntime-context.patch
@@ -0,0 +1,180 @@
+diff --git a/src/context/context.rs b/src/context/context.rs
+index c97c516..a0814fa 100644
+--- a/src/context/context.rs
++++ b/src/context/context.rs
+@@ -18,7 +18,8 @@ use crate::{
+ cpu_stats,
+ ipi::{ipi, IpiKind, IpiTarget},
+ memory::{
+- allocate_p2frame, deallocate_p2frame, Enomem, Frame, RaiiFrame, RmmA, RmmArch, PAGE_SIZE,
++ allocate_p2frame, deallocate_p2frame, Enomem, Frame, PhysicalAddress, RaiiFrame, RmmA,
++ RmmArch, PAGE_SIZE,
+ },
+ percpu::PercpuBlock,
+ scheme::{CallerCtx, FileHandle, SchemeId},
+@@ -62,6 +63,38 @@ impl Status {
+ }
+ }
+
++pub const SCHED_PRIORITY_LEVELS: usize = 40;
++pub const DEFAULT_SCHED_OTHER_PRIORITY: usize = 20;
++pub const DEFAULT_SCHED_RR_QUANTUM: u128 = 100_000_000;
++
++#[repr(u8)]
++#[derive(Clone, Copy, Debug, PartialEq, Eq)]
++pub enum SchedPolicy {
++ Fifo = 0,
++ RoundRobin = 1,
++ Other = 2,
++}
++
++impl SchedPolicy {
++ pub fn try_from_raw(raw: u8) -> Option<Self> {
++ match raw {
++ 0 => Some(Self::Fifo),
++ 1 => Some(Self::RoundRobin),
++ 2 => Some(Self::Other),
++ _ => None,
++ }
++ }
++}
++
++pub fn rt_priority_to_kernel_prio(rt_priority: u8) -> usize {
++ (SCHED_PRIORITY_LEVELS - 1)
++ .saturating_sub((usize::from(rt_priority.min(99)) * (SCHED_PRIORITY_LEVELS - 1)) / 99)
++}
++
++fn clamp_sched_other_prio(prio: usize) -> usize {
++ prio.min(SCHED_PRIORITY_LEVELS - 1)
++}
++
+ #[derive(Clone, Debug)]
+ pub enum HardBlockedReason {
+ /// "SIGSTOP", only procmgr is allowed to switch contexts this state
+@@ -140,6 +173,20 @@ pub struct Context {
+ pub fmap_ret: Option,
+ /// Priority
+ pub prio: usize,
++ pub sched_policy: SchedPolicy,
++ pub sched_rt_priority: u8,
++ pub sched_rr_ticks_consumed: u32,
++ pub sched_static_prio: usize,
++ pub sched_rr_quantum: u128,
++ /// Virtual runtime for SCHED_OTHER fair scheduling.
++ /// CPU-bound threads accumulate vruntime faster; I/O-bound stay lower.
++ pub vruntime: u128,
++ #[allow(dead_code)]
++ pub futex_pi_boost: bool,
++ #[allow(dead_code)]
++ pub futex_pi_original_prio: usize,
++ #[allow(dead_code)]
++ pub futex_pi_waiters: Vec<PhysicalAddress>,
+
+ // TODO: id can reappear after wraparound?
+ pub owner_proc_id: Option<NonZeroUsize>,
+@@ -148,6 +195,8 @@ pub struct Context {
+ pub euid: u32,
+ pub egid: u32,
+ pub pid: usize,
++ /// Supplementary group IDs for access control decisions.
++ pub groups: Vec, + + // See [`PreemptGuard`] + // +@@ -197,13 +246,23 @@ impl Context { + files: Arc::new(RwLock::new(FdTbl::new())), + userspace: false, + fmap_ret: None, +- prio: 20, ++ prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ sched_policy: SchedPolicy::Other, ++ sched_rt_priority: 0, ++ sched_rr_ticks_consumed: 0, ++ sched_static_prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ sched_rr_quantum: DEFAULT_SCHED_RR_QUANTUM, ++ vruntime: 0u128, ++ futex_pi_boost: false, ++ futex_pi_original_prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ futex_pi_waiters: Vec::new(), + being_sigkilled: false, + owner_proc_id, + + euid: 0, + egid: 0, + pid: 0, ++ groups: Vec::new(), + + #[cfg(feature = "syscall_debug")] + syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(), +@@ -218,11 +277,47 @@ impl Context { + self.preempt_locks == 0 + } + ++ fn base_sched_prio(&self) -> usize { ++ match self.sched_policy { ++ SchedPolicy::Other => clamp_sched_other_prio(self.sched_static_prio), ++ SchedPolicy::Fifo | SchedPolicy::RoundRobin => { ++ rt_priority_to_kernel_prio(self.sched_rt_priority) ++ } ++ } ++ } ++ ++ fn apply_sched_prio(&mut self) { ++ let base_prio = self.base_sched_prio(); ++ if self.futex_pi_boost { ++ self.futex_pi_original_prio = base_prio; ++ self.prio = self.prio.min(base_prio); ++ } else { ++ self.futex_pi_original_prio = base_prio; ++ self.prio = base_prio; ++ } ++ } ++ ++ pub fn set_sched_other_prio(&mut self, prio: usize) { ++ self.sched_static_prio = clamp_sched_other_prio(prio); ++ self.apply_sched_prio(); ++ } ++ ++ pub fn set_sched_policy(&mut self, sched_policy: SchedPolicy, rt_priority: u8) { ++ self.sched_policy = sched_policy; ++ self.sched_rt_priority = match sched_policy { ++ SchedPolicy::Other => 0, ++ SchedPolicy::Fifo | SchedPolicy::RoundRobin => rt_priority.min(99), ++ }; ++ self.sched_rr_ticks_consumed = 0; ++ self.apply_sched_prio(); ++ } ++ + /// Block the context, and return true if it was runnable before being blocked + pub fn block(&mut self, reason: &'static str) -> bool { + if self.status.is_runnable() { + self.status = Status::Blocked; + self.status_reason = reason; ++ self.sched_rr_ticks_consumed = 0; + true + } else { + false +@@ -232,6 +327,7 @@ impl Context { + pub fn hard_block(&mut self, reason: HardBlockedReason) -> bool { + if self.status.is_runnable() { + self.status = Status::HardBlocked { reason }; ++ self.sched_rr_ticks_consumed = 0; + + true + } else { +@@ -261,6 +357,7 @@ impl Context { + if self.status.is_soft_blocked() { + self.status = Status::Runnable; + self.status_reason = ""; ++ self.sched_rr_ticks_consumed = 0; + + true + } else { +@@ -479,6 +576,7 @@ impl Context { + uid: self.euid, + gid: self.egid, + pid: self.pid, ++ groups: self.groups.clone(), + } + } + } diff --git a/local/patches/kernel/P6-vruntime-switch.patch b/local/patches/kernel/P6-vruntime-switch.patch new file mode 100644 index 00000000..dd3023e7 --- /dev/null +++ b/local/patches/kernel/P6-vruntime-switch.patch @@ -0,0 +1,214 @@ +diff --git a/src/context/switch.rs b/src/context/switch.rs +index 86684c8..74dd5f1 100644 +--- a/src/context/switch.rs ++++ b/src/context/switch.rs +@@ -5,7 +5,7 @@ + use crate::{ + context::{ + self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard, +- Context, ContextLock, WeakContextRef, ++ Context, ContextLock, SchedPolicy, WeakContextRef, + }, + cpu_set::LogicalCpuId, + cpu_stats::{self, CpuState}, +@@ -33,35 +33,17 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ + 70, 56, 45, 36, 29, 23, 18, 15, + ]; + +-/// Determines if 
a given context is eligible to be scheduled on a given CPU (in +-/// principle, the current CPU). +-/// +-/// # Safety +-/// This function is unsafe because it modifies the `context`'s state directly without synchronization. +-/// +-/// # Parameters +-/// - `context`: The context (process/thread) to be checked. +-/// - `cpu_id`: The logical ID of the CPU on which the context is being scheduled. +-/// +-/// # Returns +-/// - `UpdateResult::CanSwitch`: If the context can be switched to. +-/// - `UpdateResult::Skip`: If the context should be skipped (e.g., it's running on another CPU). + unsafe fn update_runnable( + context: &mut Context, + cpu_id: LogicalCpuId, + switch_time: u128, + ) -> UpdateResult { +- // Ignore contexts that are already running. + if context.running { + return UpdateResult::Skip; + } +- +- // Ignore contexts assigned to other CPUs. + if !context.sched_affinity.contains(cpu_id) { + return UpdateResult::Skip; + } +- +- // If context is soft-blocked and has a wake-up time, check if it should wake up. + if context.status.is_soft_blocked() + && let Some(wake) = context.wake + && switch_time >= wake +@@ -69,8 +51,6 @@ unsafe fn update_runnable( + context.wake = None; + context.unblock_no_ipi(); + } +- +- // If the context is runnable, indicate it can be switched to. + if context.status.is_runnable() { + UpdateResult::CanSwitch + } else { +@@ -95,7 +75,7 @@ pub fn tick(token: &mut CleanLockToken) { + let new_ticks = ticks_cell.get() + 1; + ticks_cell.set(new_ticks); + +- // Trigger a context switch after every 3 ticks (approx. 6.75 ms). ++ // Trigger a context switch after every 3 ticks. + if new_ticks >= 3 { + switch(token); + crate::context::signal::signal_handler(token); +@@ -167,10 +147,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + let mut prev_context_guard = unsafe { prev_context_lock.write_arc() }; + + if !prev_context_guard.is_preemptable() { +- // Unset global lock + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); +- +- // Pretend to have finished switching, so CPU is not idled + return SwitchResult::Switched; + } + +@@ -222,6 +199,13 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + // Update times + if !was_idle { + prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time); ++ if prev_context.sched_policy == SchedPolicy::Other { ++ let actual_ns = switch_time.saturating_sub(prev_context.switch_time); ++ let weight = SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128; ++ let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128; ++ let delta = actual_ns.saturating_mul(default_weight) / weight.max(1); ++ prev_context.vruntime = prev_context.vruntime.saturating_add(delta); ++ } + } + next_context.switch_time = switch_time; + if next_context.userspace { +@@ -377,6 +361,121 @@ fn select_next_context( + let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); + let mut skipped_contexts = 0; + ++ // PASS 0: SCHED_FIFO and SCHED_RR — scan for RT contexts to schedule. ++ // When a runnable RT context is found, it takes priority over all SCHED_OTHER. 
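++ // (Queue index 0 holds the most urgent RT level, so the 0..40 scan below
++ // always prefers higher rt_priority contexts.)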
++ for prio in 0..40 { ++ let rt_contexts = contexts_list ++ .get_mut(prio) ++ .expect("prio should be between [0, 39]"); ++ let len = rt_contexts.len(); ++ for _ in 0..len { ++ let (rt_ref, rt_lock) = match rt_contexts.pop_front() { ++ Some(lock) => match lock.upgrade() { ++ Some(l) => (lock, l), ++ None => { ++ skipped_contexts += 1; ++ continue; ++ } ++ }, ++ None => break, ++ }; ++ if Arc::ptr_eq(&rt_lock, &idle_context) { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ // Current RT thread: if runnable with no higher-prio RT found yet, ++ // keep it running (no demotion to SCHED_OTHER) ++ if Arc::ptr_eq(&rt_lock, &prev_context_lock) { ++ let rt_guard = unsafe { rt_lock.write_arc() }; ++ if rt_guard.status.is_runnable() ++ && (rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin) ++ { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ return Ok(Some(rt_guard)); ++ } ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ let rt_guard = unsafe { rt_lock.write_arc() }; ++ if !rt_guard.status.is_runnable() || rt_guard.running ++ || !rt_guard.sched_affinity.contains(cpu_id) ++ { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ if rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin ++ { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ if !Arc::ptr_eq(&prev_context_lock, &idle_context) { ++ let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); ++ if prev_context_guard.status.is_runnable() { ++ contexts_list[prev_context_guard.prio].push_back(prev_ctx); ++ } else { ++ idle_contexts(token.token()).push_back(prev_ctx); ++ } ++ } ++ return Ok(Some(rt_guard)); ++ } ++ rt_contexts.push_back(rt_ref); ++ } ++ } ++ ++ // PASS 1: SCHED_OTHER — minimum-vruntime selection ++ { ++ let mut min_vruntime = u128::MAX; ++ let mut best: Option<(usize, WeakContextRef)> = None; ++ for (prio, queue) in contexts_list.iter().enumerate() { ++ for ctx_ref in queue.iter() { ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ if Arc::ptr_eq(&ctx_lock, &prev_context_lock) || Arc::ptr_eq(&ctx_lock, &idle_context) { ++ continue; ++ } ++ if let Some(guard) = ctx_lock.try_read(token.token()) { ++ if guard.status.is_runnable() && !guard.running ++ && guard.sched_affinity.contains(cpu_id) ++ && guard.sched_policy == SchedPolicy::Other ++ { ++ let v = guard.vruntime; ++ drop(guard); ++ if v < min_vruntime { ++ min_vruntime = v; ++ best = Some((prio, ctx_ref.clone())); ++ } ++ } ++ } ++ } ++ } ++ } ++ if let Some((best_prio, ctx_ref)) = best { ++ { ++ let queue = contexts_list.get_mut(best_prio).expect("valid prio"); ++ queue.retain(|r| !WeakContextRef::eq(r, &ctx_ref)); ++ } ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ let guard = unsafe { ctx_lock.write_arc() }; ++ if guard.status.is_runnable() { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ if !Arc::ptr_eq(&prev_context_lock, &idle_context) { ++ let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); ++ if prev_context_guard.status.is_runnable() { ++ contexts_list[prev_context_guard.prio].push_back(prev_ctx); ++ } else { ++ idle_contexts(token.token()).push_back(prev_ctx); ++ } ++ } ++ return Ok(Some(guard)); ++ } ++ } ++ } ++ } ++ ++ // PASS 2: fallback DWRR deficit tracking ++ + 'priority: loop { + i = (i + 1) % 40; + total_iters += 1; diff --git a/local/patches/kernel/P7-cache-affine-context.patch b/local/patches/kernel/P7-cache-affine-context.patch new file mode 100644 index 00000000..6a42096c --- 
/dev/null +++ b/local/patches/kernel/P7-cache-affine-context.patch @@ -0,0 +1,196 @@ +diff --git a/src/context/context.rs b/src/context/context.rs +index c97c516..18fbd7f 100644 +--- a/src/context/context.rs ++++ b/src/context/context.rs +@@ -18,7 +18,8 @@ use crate::{ + cpu_stats, + ipi::{ipi, IpiKind, IpiTarget}, + memory::{ +- allocate_p2frame, deallocate_p2frame, Enomem, Frame, RaiiFrame, RmmA, RmmArch, PAGE_SIZE, ++ allocate_p2frame, deallocate_p2frame, Enomem, Frame, PhysicalAddress, RaiiFrame, RmmA, ++ RmmArch, PAGE_SIZE, + }, + percpu::PercpuBlock, + scheme::{CallerCtx, FileHandle, SchemeId}, +@@ -62,6 +63,38 @@ impl Status { + } + } + ++pub const SCHED_PRIORITY_LEVELS: usize = 40; ++pub const DEFAULT_SCHED_OTHER_PRIORITY: usize = 20; ++pub const DEFAULT_SCHED_RR_QUANTUM: u128 = 100_000_000; ++ ++#[repr(u8)] ++#[derive(Clone, Copy, Debug, PartialEq, Eq)] ++pub enum SchedPolicy { ++ Fifo = 0, ++ RoundRobin = 1, ++ Other = 2, ++} ++ ++impl SchedPolicy { ++ pub fn try_from_raw(raw: u8) -> Option { ++ match raw { ++ 0 => Some(Self::Fifo), ++ 1 => Some(Self::RoundRobin), ++ 2 => Some(Self::Other), ++ _ => None, ++ } ++ } ++} ++ ++pub fn rt_priority_to_kernel_prio(rt_priority: u8) -> usize { ++ (SCHED_PRIORITY_LEVELS - 1) ++ .saturating_sub((usize::from(rt_priority.min(99)) * (SCHED_PRIORITY_LEVELS - 1)) / 99) ++} ++ ++fn clamp_sched_other_prio(prio: usize) -> usize { ++ prio.min(SCHED_PRIORITY_LEVELS - 1) ++} ++ + #[derive(Clone, Debug)] + pub enum HardBlockedReason { + /// "SIGSTOP", only procmgr is allowed to switch contexts this state +@@ -96,6 +129,7 @@ pub struct Context { + pub running: bool, + /// Current CPU ID + pub cpu_id: Option, ++ pub last_cpu: Option, + /// Time this context was switched to + pub switch_time: u128, + /// Amount of CPU time used +@@ -140,6 +174,20 @@ pub struct Context { + pub fmap_ret: Option, + /// Priority + pub prio: usize, ++ pub sched_policy: SchedPolicy, ++ pub sched_rt_priority: u8, ++ pub sched_rr_ticks_consumed: u32, ++ pub sched_static_prio: usize, ++pub sched_rr_quantum: u128, ++ /// Virtual runtime for SCHED_OTHER fair scheduling. ++ /// CPU-bound threads accumulate vruntime faster; I/O-bound stay lower. ++ pub vruntime: u128, ++ #[allow(dead_code)] ++ pub futex_pi_boost: bool, ++ #[allow(dead_code)] ++ pub futex_pi_original_prio: usize, ++ #[allow(dead_code)] ++ pub futex_pi_waiters: Vec, + + // TODO: id can reappear after wraparound? + pub owner_proc_id: Option, +@@ -148,6 +196,8 @@ pub struct Context { + pub euid: u32, + pub egid: u32, + pub pid: usize, ++ /// Supplementary group IDs for access control decisions. 
++ pub groups: Vec, + + // See [`PreemptGuard`] + // +@@ -182,6 +232,7 @@ impl Context { + status_reason: "", + running: false, + cpu_id: None, ++ last_cpu: None, + switch_time: 0, + cpu_time: 0, + sched_affinity: LogicalCpuSet::all(), +@@ -197,13 +248,23 @@ impl Context { + files: Arc::new(RwLock::new(FdTbl::new())), + userspace: false, + fmap_ret: None, +- prio: 20, ++ prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ sched_policy: SchedPolicy::Other, ++ sched_rt_priority: 0, ++ sched_rr_ticks_consumed: 0, ++ sched_static_prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ sched_rr_quantum: DEFAULT_SCHED_RR_QUANTUM, ++ vruntime: 0u128, ++ futex_pi_boost: false, ++ futex_pi_original_prio: DEFAULT_SCHED_OTHER_PRIORITY, ++ futex_pi_waiters: Vec::new(), + being_sigkilled: false, + owner_proc_id, + + euid: 0, + egid: 0, + pid: 0, ++ groups: Vec::new(), + + #[cfg(feature = "syscall_debug")] + syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(), +@@ -218,11 +279,47 @@ impl Context { + self.preempt_locks == 0 + } + ++ fn base_sched_prio(&self) -> usize { ++ match self.sched_policy { ++ SchedPolicy::Other => clamp_sched_other_prio(self.sched_static_prio), ++ SchedPolicy::Fifo | SchedPolicy::RoundRobin => { ++ rt_priority_to_kernel_prio(self.sched_rt_priority) ++ } ++ } ++ } ++ ++ fn apply_sched_prio(&mut self) { ++ let base_prio = self.base_sched_prio(); ++ if self.futex_pi_boost { ++ self.futex_pi_original_prio = base_prio; ++ self.prio = self.prio.min(base_prio); ++ } else { ++ self.futex_pi_original_prio = base_prio; ++ self.prio = base_prio; ++ } ++ } ++ ++ pub fn set_sched_other_prio(&mut self, prio: usize) { ++ self.sched_static_prio = clamp_sched_other_prio(prio); ++ self.apply_sched_prio(); ++ } ++ ++ pub fn set_sched_policy(&mut self, sched_policy: SchedPolicy, rt_priority: u8) { ++ self.sched_policy = sched_policy; ++ self.sched_rt_priority = match sched_policy { ++ SchedPolicy::Other => 0, ++ SchedPolicy::Fifo | SchedPolicy::RoundRobin => rt_priority.min(99), ++ }; ++ self.sched_rr_ticks_consumed = 0; ++ self.apply_sched_prio(); ++ } ++ + /// Block the context, and return true if it was runnable before being blocked + pub fn block(&mut self, reason: &'static str) -> bool { + if self.status.is_runnable() { + self.status = Status::Blocked; + self.status_reason = reason; ++ self.sched_rr_ticks_consumed = 0; + true + } else { + false +@@ -232,6 +329,7 @@ impl Context { + pub fn hard_block(&mut self, reason: HardBlockedReason) -> bool { + if self.status.is_runnable() { + self.status = Status::HardBlocked { reason }; ++ self.sched_rr_ticks_consumed = 0; + + true + } else { +@@ -261,6 +359,7 @@ impl Context { + if self.status.is_soft_blocked() { + self.status = Status::Runnable; + self.status_reason = ""; ++ self.sched_rr_ticks_consumed = 0; + + true + } else { +@@ -479,6 +578,7 @@ impl Context { + uid: self.euid, + gid: self.egid, + pid: self.pid, ++ groups: self.groups.clone(), + } + } + } diff --git a/local/patches/kernel/P7-cache-affine-switch.patch b/local/patches/kernel/P7-cache-affine-switch.patch new file mode 100644 index 00000000..8ea0b258 --- /dev/null +++ b/local/patches/kernel/P7-cache-affine-switch.patch @@ -0,0 +1,225 @@ +diff --git a/src/context/switch.rs b/src/context/switch.rs +index 86684c8..cd5f7ed 100644 +--- a/src/context/switch.rs ++++ b/src/context/switch.rs +@@ -5,7 +5,7 @@ + use crate::{ + context::{ + self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard, +- Context, ContextLock, WeakContextRef, ++ Context, ContextLock, SchedPolicy, 
WeakContextRef, + }, + cpu_set::LogicalCpuId, + cpu_stats::{self, CpuState}, +@@ -33,35 +33,17 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ + 70, 56, 45, 36, 29, 23, 18, 15, + ]; + +-/// Determines if a given context is eligible to be scheduled on a given CPU (in +-/// principle, the current CPU). +-/// +-/// # Safety +-/// This function is unsafe because it modifies the `context`'s state directly without synchronization. +-/// +-/// # Parameters +-/// - `context`: The context (process/thread) to be checked. +-/// - `cpu_id`: The logical ID of the CPU on which the context is being scheduled. +-/// +-/// # Returns +-/// - `UpdateResult::CanSwitch`: If the context can be switched to. +-/// - `UpdateResult::Skip`: If the context should be skipped (e.g., it's running on another CPU). + unsafe fn update_runnable( + context: &mut Context, + cpu_id: LogicalCpuId, + switch_time: u128, + ) -> UpdateResult { +- // Ignore contexts that are already running. + if context.running { + return UpdateResult::Skip; + } +- +- // Ignore contexts assigned to other CPUs. + if !context.sched_affinity.contains(cpu_id) { + return UpdateResult::Skip; + } +- +- // If context is soft-blocked and has a wake-up time, check if it should wake up. + if context.status.is_soft_blocked() + && let Some(wake) = context.wake + && switch_time >= wake +@@ -69,8 +51,6 @@ unsafe fn update_runnable( + context.wake = None; + context.unblock_no_ipi(); + } +- +- // If the context is runnable, indicate it can be switched to. + if context.status.is_runnable() { + UpdateResult::CanSwitch + } else { +@@ -95,7 +75,7 @@ pub fn tick(token: &mut CleanLockToken) { + let new_ticks = ticks_cell.get() + 1; + ticks_cell.set(new_ticks); + +- // Trigger a context switch after every 3 ticks (approx. 6.75 ms). ++ // Trigger a context switch after every 3 ticks. + if new_ticks >= 3 { + switch(token); + crate::context::signal::signal_handler(token); +@@ -167,10 +147,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + let mut prev_context_guard = unsafe { prev_context_lock.write_arc() }; + + if !prev_context_guard.is_preemptable() { +- // Unset global lock + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); +- +- // Pretend to have finished switching, so CPU is not idled + return SwitchResult::Switched; + } + +@@ -213,6 +190,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + + // Set the previous context as "not running" + prev_context.running = false; ++ prev_context.last_cpu = prev_context.cpu_id; + + // Set the next context as "running" + next_context.running = true; +@@ -222,6 +200,13 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + // Update times + if !was_idle { + prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time); ++ if prev_context.sched_policy == SchedPolicy::Other { ++ let actual_ns = switch_time.saturating_sub(prev_context.switch_time); ++ let weight = SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128; ++ let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128; ++ let delta = actual_ns.saturating_mul(default_weight) / weight.max(1); ++ prev_context.vruntime = prev_context.vruntime.saturating_add(delta); ++ } + } + next_context.switch_time = switch_time; + if next_context.userspace { +@@ -377,6 +362,124 @@ fn select_next_context( + let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); + let mut skipped_contexts = 0; + ++ // PASS 0: SCHED_FIFO and SCHED_RR — scan for RT contexts to schedule. 
++ // When a runnable RT context is found, it takes priority over all SCHED_OTHER. ++ for prio in 0..40 { ++ let rt_contexts = contexts_list ++ .get_mut(prio) ++ .expect("prio should be between [0, 39]"); ++ let len = rt_contexts.len(); ++ for _ in 0..len { ++ let (rt_ref, rt_lock) = match rt_contexts.pop_front() { ++ Some(lock) => match lock.upgrade() { ++ Some(l) => (lock, l), ++ None => { ++ skipped_contexts += 1; ++ continue; ++ } ++ }, ++ None => break, ++ }; ++ if Arc::ptr_eq(&rt_lock, &idle_context) { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ // Current RT thread: if runnable with no higher-prio RT found yet, ++ // keep it running (no demotion to SCHED_OTHER) ++ if Arc::ptr_eq(&rt_lock, &prev_context_lock) { ++ let rt_guard = unsafe { rt_lock.write_arc() }; ++ if rt_guard.status.is_runnable() ++ && (rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin) ++ { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ return Ok(Some(rt_guard)); ++ } ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ let rt_guard = unsafe { rt_lock.write_arc() }; ++ if !rt_guard.status.is_runnable() || rt_guard.running ++ || !rt_guard.sched_affinity.contains(cpu_id) ++ { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ if rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin ++ { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ if !Arc::ptr_eq(&prev_context_lock, &idle_context) { ++ let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); ++ if prev_context_guard.status.is_runnable() { ++ contexts_list[prev_context_guard.prio].push_back(prev_ctx); ++ } else { ++ idle_contexts(token.token()).push_back(prev_ctx); ++ } ++ } ++ return Ok(Some(rt_guard)); ++ } ++ rt_contexts.push_back(rt_ref); ++ } ++ } ++ ++ // PASS 1: SCHED_OTHER — minimum-vruntime selection ++ { ++ let mut min_vruntime = u128::MAX; ++ let mut best: Option<(usize, WeakContextRef)> = None; ++ for (prio, queue) in contexts_list.iter().enumerate() { ++ for ctx_ref in queue.iter() { ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ if Arc::ptr_eq(&ctx_lock, &prev_context_lock) || Arc::ptr_eq(&ctx_lock, &idle_context) { ++ continue; ++ } ++ if let Some(guard) = ctx_lock.try_read(token.token()) { ++ if guard.status.is_runnable() && !guard.running ++ && guard.sched_affinity.contains(cpu_id) ++ && guard.sched_policy == SchedPolicy::Other ++ { ++ let mut v = guard.vruntime; ++ if guard.last_cpu == Some(cpu_id) { ++ v = v.saturating_sub(v / 8); ++ } ++ drop(guard); ++ if v < min_vruntime { ++ min_vruntime = v; ++ best = Some((prio, ctx_ref.clone())); ++ } ++ } ++ } ++ } ++ } ++ } ++ if let Some((best_prio, ctx_ref)) = best { ++ { ++ let queue = contexts_list.get_mut(best_prio).expect("valid prio"); ++ queue.retain(|r| !WeakContextRef::eq(r, &ctx_ref)); ++ } ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ let guard = unsafe { ctx_lock.write_arc() }; ++ if guard.status.is_runnable() { ++ percpu.balance.set(balance); ++ percpu.last_queue.set(i); ++ if !Arc::ptr_eq(&prev_context_lock, &idle_context) { ++ let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); ++ if prev_context_guard.status.is_runnable() { ++ contexts_list[prev_context_guard.prio].push_back(prev_ctx); ++ } else { ++ idle_contexts(token.token()).push_back(prev_ctx); ++ } ++ } ++ return Ok(Some(guard)); ++ } ++ } ++ } ++ } ++ ++ // PASS 2: fallback DWRR deficit tracking ++ + 'priority: loop { + i = (i + 1) % 40; + total_iters += 1; diff 
--git a/local/patches/kernel/P7-proc-setname.patch b/local/patches/kernel/P7-proc-setname.patch new file mode 100644 index 00000000..70821737 --- /dev/null +++ b/local/patches/kernel/P7-proc-setname.patch @@ -0,0 +1,47 @@ +diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs +--- a/src/scheme/proc.rs ++++ b/src/scheme/proc.rs +@@ -147,6 +147,7 @@ enum ContextHandle { + Priority, + SchedAffinity, + SchedPolicy, ++ Name, + + MmapMinAddr(Arc), + } +@@ -267,6 +268,7 @@ impl ProcScheme { + "sched-affinity" => (ContextHandle::SchedAffinity, true), + // TODO: Switch this kernel-local proc handle over to a stable upstream + // redox_syscall ProcCall::SetSchedPolicy opcode once that lands. + "sched-policy" => (ContextHandle::SchedPolicy, false), ++ "name" => (ContextHandle::Name, false), + "status" => (ContextHandle::Status { privileged: false }, false), + _ if path.starts_with("auth-") => { + let nonprefix = &path["auth-".len()..]; +@@ -1218,6 +1220,16 @@ impl ContextHandle { + Ok(2) + } ++ ContextHandle::Name => { ++ let mut name_buf = [0u8; 32]; ++ let len = buf.copy_common_bytes_to_slice(&mut name_buf[..31]).unwrap_or(0); ++ let mut context = context.write(token.token()); ++ context.name.clear(); ++ if let Ok(s) = core::str::from_utf8(&name_buf[..len]) { ++ context.name.push_str(s); ++ } ++ Ok(len) ++ } + ContextHandle::Status { privileged } => { + let mut args = buf.usizes(); + +@@ -1532,6 +1544,10 @@ impl ContextHandle { + let data = [context.sched_policy as u8, context.sched_rt_priority]; + buf.copy_common_bytes_from_slice(&data) + } ++ ContextHandle::Name => { ++ let context = context.read(token.token()); ++ buf.copy_common_bytes_from_slice(context.name.as_bytes()) ++ } + ContextHandle::Status { .. } => { + let status = { + let context = context.read(token.token()); diff --git a/local/patches/kernel/P7-proc-setpriority.patch b/local/patches/kernel/P7-proc-setpriority.patch new file mode 100644 index 00000000..e65a95bd --- /dev/null +++ b/local/patches/kernel/P7-proc-setpriority.patch @@ -0,0 +1,70 @@ +diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs +--- a/src/scheme/proc.rs ++++ b/src/scheme/proc.rs +@@ -145,8 +145,9 @@ enum ContextHandle { + // TODO: Remove this once openat is implemented, or allow openat-via-dup via e.g. the top-level + // directory. + OpenViaDup, ++ Priority, + SchedAffinity, + SchedPolicy, + Name, + + MmapMinAddr(Arc), +@@ -160,6 +161,17 @@ pub struct ProcScheme; + static NEXT_ID: AtomicUsize = AtomicUsize::new(1); + static HANDLES: RwLock> = + RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); ++ ++const NICE_MIN: i32 = -20; ++const NICE_MAX: i32 = 19; ++ ++fn nice_to_kernel_prio(nice: i32) -> usize { ++ (nice.saturating_add(20)).clamp(0, 39) as usize ++} ++ ++fn kernel_prio_to_nice(prio: usize) -> i32 { ++ (prio.min(39) as i32) - 20 ++} + + #[cfg(feature = "debugger")] + #[allow(dead_code)] + pub fn foreach_addrsp( +@@ -253,6 +265,7 @@ impl ProcScheme { + "sighandler" => (ContextHandle::Sighandler, false), + "start" => (ContextHandle::Start, false), + "open_via_dup" => (ContextHandle::OpenViaDup, false), ++ "priority" => (ContextHandle::Priority, false), + "mmap-min-addr" => ( + ContextHandle::MmapMinAddr(Arc::clone( + context +@@ -1191,6 +1204,17 @@ impl ContextHandle { + + Ok(size_of_val(&mask)) + } ++ Self::Priority => { ++ let nice = unsafe { buf.read_exact::()? 
};
++ if !(NICE_MIN..=NICE_MAX).contains(&nice) {
++ return Err(Error::new(EINVAL));
++ }
++
++ context
++ .write(token.token())
++ .set_sched_other_prio(nice_to_kernel_prio(nice));
++
++ Ok(size_of::<i32>())
++ }
+ Self::SchedPolicy => {
+ if buf.len() != 2 {
+ return Err(Error::new(EINVAL));
+@@ -1522,6 +1546,10 @@ impl ContextHandle {
+
+ buf.copy_exactly(crate::cpu_set::mask_as_bytes(&mask))?;
+ Ok(size_of_val(&mask))
++ }
++ ContextHandle::Priority => {
++ let nice = kernel_prio_to_nice(context.read(token.token()).prio);
++ buf.copy_common_bytes_from_slice(&nice.to_ne_bytes())
+ }
+ ContextHandle::SchedPolicy => {
+ let context = context.read(token.token());
diff --git a/local/patches/kernel/P8-futex-pi.patch b/local/patches/kernel/P8-futex-pi.patch
new file mode 100644
index 00000000..b1c3ab45
--- /dev/null
+++ b/local/patches/kernel/P8-futex-pi.patch
@@ -0,0 +1,364 @@
+diff --git a/src/syscall/futex.rs b/src/syscall/futex.rs
+--- a/src/syscall/futex.rs
++++ b/src/syscall/futex.rs
+@@
+-use crate::syscall::{
+- data::TimeSpec,
+- error::{Error, Result, EAGAIN, EFAULT, EINVAL, ETIMEDOUT},
+- flag::{FUTEX_REQUEUE, FUTEX_WAIT, FUTEX_WAIT64, FUTEX_WAKE},
+-};
++use crate::syscall::{
++ data::TimeSpec,
++ error::{Error, Result, EAGAIN, EDEADLK, EFAULT, EINTR, EINVAL, EPERM, ETIMEDOUT},
++ flag::{FUTEX_REQUEUE, FUTEX_WAIT, FUTEX_WAIT64, FUTEX_WAKE},
++};
++
++const FUTEX_LOCK_PI: usize = 6;
++const FUTEX_UNLOCK_PI: usize = 7;
++const FUTEX_TRYLOCK_PI: usize = 8;
++
++const FUTEX_WAITERS: u32 = 0x8000_0000;
++const FUTEX_OWNER_DIED: u32 = 0x4000_0000;
++const FUTEX_TID_MASK: u32 = 0x3FFF_FFFF;
+@@
+-type FutexList = HashMap<PhysicalAddress, Vec<FutexEntry>>;
++type FutexList = HashMap<PhysicalAddress, FutexQueue>;
++
++#[derive(Clone, Copy, Debug, Eq, PartialEq)]
++enum FutexWaitKind {
++ Regular,
++ PriorityInheritance,
++}
++
++#[derive(Default)]
++struct FutexQueue {
++ waiters: Vec<FutexEntry>,
++ pi_owner: Option<Weak<ContextLock>>,
++}
++
++impl FutexQueue {
++ fn is_empty(&self) -> bool {
++ self.waiters.is_empty() && self.pi_owner.is_none()
++ }
++}
+@@
+ pub struct FutexEntry {
+@@
+ // address space to check against if virt matches but not phys
+ addr_space: Weak<AddrSpaceWrapper>,
++ kind: FutexWaitKind,
+ }
+@@
++fn context_futex_tid(context: &crate::context::Context) -> u32 {
++ let tid = u32::try_from(context.pid).unwrap_or(context.debug_id) & FUTEX_TID_MASK;
++ if tid == 0 {
++ context.debug_id & FUTEX_TID_MASK
++ } else {
++ tid
++ }
++}
++
++fn current_context_futex_tid(context_lock: &Arc<ContextLock>, token: &mut CleanLockToken) -> u32 {
++ let context = context_lock.read(token.token());
++ context_futex_tid(&context)
++}
++
++fn push_owner_waiter(owner: &mut crate::context::Context, phys: PhysicalAddress) {
++ if !owner.futex_pi_waiters.iter().any(|waiter| *waiter == phys) {
++ owner.futex_pi_waiters.push(phys);
++ }
++}
++
++fn pop_owner_waiter(owner: &mut crate::context::Context, phys: PhysicalAddress) {
++ owner.futex_pi_waiters.retain(|waiter| *waiter != phys);
++}
++
++fn boost_pi_owner(
++ owner_lock: &Arc<ContextLock>,
++ waiter_prio: usize,
++ phys: PhysicalAddress,
++ token: &mut crate::sync::LockToken<'_, L1>,
++) {
++ let mut owner = owner_lock.write(token.token());
++ push_owner_waiter(&mut owner, phys);
++ if owner.prio > waiter_prio {
++ if !owner.futex_pi_boost {
++ owner.futex_pi_original_prio = owner.prio;
++ }
++ owner.futex_pi_boost = true;
++ owner.prio = owner.prio.min(waiter_prio);
++ }
++}
++
++fn restore_pi_owner(owner: &mut crate::context::Context, phys: PhysicalAddress) {
++ pop_owner_waiter(owner, phys);
++ if owner.futex_pi_boost && owner.futex_pi_waiters.is_empty() {
++ 
owner.futex_pi_boost = false; ++ owner.prio = owner.futex_pi_original_prio; ++ } ++} ++ ++fn queue_waiter( ++ queue: &mut FutexQueue, ++ target_virtaddr: VirtualAddress, ++ context_lock: &Arc, ++ addr_space: &Arc, ++ kind: FutexWaitKind, ++) { ++ queue.waiters.push(FutexEntry { ++ target_virtaddr, ++ context_lock: Arc::clone(context_lock), ++ addr_space: Arc::downgrade(addr_space), ++ kind, ++ }); ++} +@@ +- futexes +- .entry(locked_physaddr) +- .or_insert_with(Vec::new) +- .push(FutexEntry { +- target_virtaddr, +- context_lock: context_lock.clone(), +- addr_space: Arc::downgrade(¤t_addrsp), +- }); ++ let queue = futexes.entry(locked_physaddr).or_insert_with(FutexQueue::default); ++ queue_waiter( ++ queue, ++ target_virtaddr, ++ &context_lock, ++ ¤t_addrsp, ++ FutexWaitKind::Regular, ++ ); +@@ +- let remove_queue = if let Some(futexes) = futexes_map.get_mut(&target_physaddr) { +- let mut i = 0; +- let current_addrsp_weak = Arc::downgrade(¤t_addrsp); +- while i < futexes.len() && woken < val { +- let futex = unsafe { futexes.get_unchecked_mut(i) }; +- if futex.target_virtaddr != target_virtaddr +- || !current_addrsp_weak.ptr_eq(&futex.addr_space) +- { +- i += 1; +- continue; +- } +- futex.context_lock.write(futex_token.token()).unblock(); +- futexes.swap_remove(i); +- woken += 1; +- } +- futexes.is_empty() ++ let remove_queue = if let Some(queue) = futexes_map.get_mut(&target_physaddr) { ++ let mut i = 0; ++ let current_addrsp_weak = Arc::downgrade(¤t_addrsp); ++ while i < queue.waiters.len() && woken < val { ++ let waiter = match queue.waiters.get(i) { ++ Some(waiter) => waiter, ++ None => break, ++ }; ++ if waiter.kind != FutexWaitKind::Regular ++ || waiter.target_virtaddr != target_virtaddr ++ || !current_addrsp_weak.ptr_eq(&waiter.addr_space) ++ { ++ i += 1; ++ continue; ++ } ++ let waiter = queue.waiters.swap_remove(i); ++ waiter.context_lock.write(futex_token.token()).unblock(); ++ woken += 1; ++ } ++ queue.is_empty() + } else { + false + }; +@@ +- let mut source_waiters = source_map.remove(&locked_source_physaddr).unwrap_or_default(); ++ let mut source_queue = source_map.remove(&locked_source_physaddr).unwrap_or_default(); +@@ +- total_woken = wake_from(&mut source_waiters, val, &mut futex_token); ++ total_woken = wake_from(&mut source_queue.waiters, val, &mut futex_token); +@@ +- let mut target_waiters = target_map.remove(&locked_target_physaddr).unwrap_or_default(); +- let mut i = 0; +- while i < source_waiters.len() && total_requeued < val2 { +- let should_move = source_waiters ++ let mut target_queue = target_map.remove(&locked_target_physaddr).unwrap_or_default(); ++ let mut i = 0; ++ while i < source_queue.waiters.len() && total_requeued < val2 { ++ let should_move = source_queue ++ .waiters + .get(i) + .map(|waiter| { +- waiter.target_virtaddr == target_virtaddr ++ waiter.kind == FutexWaitKind::Regular ++ && waiter.target_virtaddr == target_virtaddr + && current_addrsp_weak.ptr_eq(&waiter.addr_space) + }) + .unwrap_or(false); +@@ +- let mut waiter = source_waiters.swap_remove(i); +- waiter.target_virtaddr = target2_virtaddr; +- target_waiters.push(waiter); ++ let mut waiter = source_queue.waiters.swap_remove(i); ++ waiter.target_virtaddr = target2_virtaddr; ++ target_queue.waiters.push(waiter); + total_requeued += 1; + } +- if !target_waiters.is_empty() { +- target_map.insert(locked_target_physaddr, target_waiters); ++ if !target_queue.is_empty() { ++ target_map.insert(locked_target_physaddr, target_queue); + } +@@ +- if !source_waiters.is_empty() { +- 
source_map.insert(locked_source_physaddr, source_waiters); ++ if !source_queue.is_empty() { ++ source_map.insert(locked_source_physaddr, source_queue); + } +@@ ++ FUTEX_LOCK_PI | FUTEX_TRYLOCK_PI => { ++ let _ = validate_futex_u32_addr(addr)?; ++ let context_lock = context::current(); ++ let current_tid = current_context_futex_tid(&context_lock, token); ++ let current_prio = context_lock.read(token.token()).prio; ++ ++ loop { ++ let outcome = { ++ let shard = futex_shard(target_physaddr); ++ let mut futexes = FUTEXES[shard].lock(token.token()); ++ let (futexes, mut futex_token) = futexes.token_split(); ++ let addr_space_guard = current_addrsp.acquire_read(futex_token.downgrade()); ++ let locked_physaddr = validate_and_translate_virt(&addr_space_guard, target_virtaddr) ++ .ok_or(Error::new(EFAULT))?; ++ if locked_physaddr != target_physaddr { ++ None ++ } else { ++ drop(addr_space_guard); ++ let futex_atomic = futex_atomic_u32(locked_physaddr); ++ let mut current = futex_atomic.load(Ordering::SeqCst); ++ loop { ++ let owner_tid = current & FUTEX_TID_MASK; ++ let queue = futexes.entry(locked_physaddr).or_insert_with(FutexQueue::default); ++ let desired_waiters = if queue.waiters.is_empty() { 0 } else { FUTEX_WAITERS }; ++ ++ if owner_tid == 0 { ++ let desired = current_tid | desired_waiters; ++ match futex_atomic.compare_exchange(current, desired, Ordering::SeqCst, Ordering::SeqCst) { ++ Ok(_) => { ++ queue.pi_owner = Some(Arc::downgrade(&context_lock)); ++ break Some(Ok(Ok(0))); ++ } ++ Err(actual) => current = actual, ++ } ++ continue; ++ } ++ ++ if owner_tid == current_tid { ++ break Some(Ok(Err(Error::new(EDEADLK)))); ++ } ++ ++ if op == FUTEX_TRYLOCK_PI { ++ break Some(Ok(Err(Error::new(EAGAIN)))); ++ } ++ ++ if let Some(owner_lock) = queue.pi_owner.as_ref().and_then(Weak::upgrade) { ++ boost_pi_owner(&owner_lock, current_prio, locked_physaddr, &mut futex_token); ++ } ++ ++ { ++ let mut context = context_lock.write(futex_token.token()); ++ if let Some((tctl, pctl, _)) = context.sigcontrol() ++ && tctl.currently_pending_unblocked(pctl) != 0 ++ { ++ break Some(Ok(Err(Error::new(EINTR)))); ++ } ++ context.wake = None; ++ context.block("futex_pi"); ++ } ++ ++ queue_waiter( ++ queue, ++ target_virtaddr, ++ &context_lock, ++ ¤t_addrsp, ++ FutexWaitKind::PriorityInheritance, ++ ); ++ futex_atomic.fetch_or(FUTEX_WAITERS, Ordering::SeqCst); ++ break Some(Ok(Ok(1))); ++ } ++ } ++ }; ++ ++ match outcome { ++ None => continue, ++ Some(Ok(Ok(0))) => return Ok(0), ++ Some(Ok(Ok(_))) => context::switch(token), ++ Some(Ok(Err(err))) => return Err(err), ++ Some(Err(err)) => return Err(err), ++ } ++ } ++ } ++ FUTEX_UNLOCK_PI => { ++ let _ = validate_futex_u32_addr(addr)?; ++ let context_lock = context::current(); ++ let current_tid = current_context_futex_tid(&context_lock, token); ++ let shard = futex_shard(target_physaddr); ++ let current_addrsp_weak = Arc::downgrade(¤t_addrsp); ++ ++ let unlocked = { ++ let mut futexes = FUTEXES[shard].lock(token.token()); ++ let (futexes, mut futex_token) = futexes.token_split(); ++ let addr_space_guard = current_addrsp.acquire_read(futex_token.downgrade()); ++ let locked_physaddr = validate_and_translate_virt(&addr_space_guard, target_virtaddr) ++ .ok_or(Error::new(EFAULT))?; ++ if locked_physaddr != target_physaddr { ++ return Err(Error::new(EAGAIN)); ++ } ++ drop(addr_space_guard); ++ ++ let futex_atomic = futex_atomic_u32(locked_physaddr); ++ let current = futex_atomic.load(Ordering::SeqCst); ++ if (current & FUTEX_TID_MASK) != current_tid { ++ return 
Err(Error::new(EPERM));
++                }
++
++                let mut wake_one = None;
++                let mut new = current & !(FUTEX_TID_MASK | FUTEX_OWNER_DIED);
++                if let Some(queue) = futexes.get_mut(&locked_physaddr) {
++                    queue.pi_owner = None;
++                    let mut best = None;
++                    for (idx, waiter) in queue.waiters.iter().enumerate() {
++                        if waiter.kind != FutexWaitKind::PriorityInheritance
++                            || waiter.target_virtaddr != target_virtaddr
++                            || !current_addrsp_weak.ptr_eq(&waiter.addr_space)
++                        {
++                            continue;
++                        }
++                        let prio = waiter.context_lock.read(futex_token.token()).prio;
++                        match best {
++                            Some((_, best_prio)) if prio >= best_prio => {}
++                            _ => best = Some((idx, prio)),
++                        }
++                    }
++                    if let Some((waiter_idx, _)) = best {
++                        wake_one = Some(queue.waiters.swap_remove(waiter_idx));
++                    }
++                    if !queue.waiters.is_empty() {
++                        new |= FUTEX_WAITERS;
++                    }
++                }
++
++                futex_atomic.store(new, Ordering::SeqCst);
++                {
++                    let mut context = context_lock.write(futex_token.token());
++                    restore_pi_owner(&mut context, locked_physaddr);
++                }
++                if let Some(waiter) = wake_one {
++                    waiter.context_lock.write(futex_token.token()).unblock();
++                }
++                true
++            };
++
++            Ok(usize::from(unlocked))
++        }
+        _ => Err(Error::new(EINVAL)),
+    }
+ }
diff --git a/local/patches/kernel/P8-futex-requeue.patch b/local/patches/kernel/P8-futex-requeue.patch
new file mode 100644
index 00000000..768b7628
--- /dev/null
+++ b/local/patches/kernel/P8-futex-requeue.patch
@@ -0,0 +1,282 @@
+diff --git a/src/syscall/debug.rs b/src/syscall/debug.rs
+--- a/src/syscall/debug.rs
++++ b/src/syscall/debug.rs
+@@
+-        SYS_FUTEX => format!(
+-            "futex({:#X} [{:?}], {}, {}, {}, {})",
++        SYS_FUTEX => format!(
++            "futex({:#X} [{:?}], {}, {}, {}, {}, {})",
+            b,
+            UserSlice::ro(b, 4).and_then(|buf| buf.read_u32()),
+            c,
+            d,
+            e,
+-            f
++            f,
++            g,
+        ),
+diff --git a/src/syscall/futex.rs b/src/syscall/futex.rs
+--- a/src/syscall/futex.rs
++++ b/src/syscall/futex.rs
+@@
+-use crate::syscall::{
+-    data::TimeSpec,
+-    error::{Error, Result, EAGAIN, EFAULT, EINVAL, ETIMEDOUT},
+-    flag::{FUTEX_WAIT, FUTEX_WAIT64, FUTEX_WAKE},
+-};
++use crate::syscall::{
++    data::TimeSpec,
++    error::{Error, Result, EAGAIN, EFAULT, EINVAL, ETIMEDOUT},
++    flag::{FUTEX_REQUEUE, FUTEX_WAIT, FUTEX_WAIT64, FUTEX_WAKE},
++};
++
++const FUTEX_CMP_REQUEUE: usize = 4;
+@@
+ pub struct FutexEntry {
+@@
+ }
++
++fn validate_futex_u32_addr(addr: usize) -> Result<VirtualAddress> {
++    if !addr.is_multiple_of(4) {
++        return Err(Error::new(EINVAL));
++    }
++    Ok(VirtualAddress::new(addr))
++}
++
++fn lock_futex_pair<R>(
++    first_shard: usize,
++    second_shard: usize,
++    token: &mut CleanLockToken,
++    f: impl FnOnce(&mut FutexList, Option<&mut FutexList>, crate::sync::LockToken<'_, L1>) -> R,
++) -> R {
++    if first_shard == second_shard {
++        let mut guard = FUTEXES[first_shard].lock(token.token());
++        let (map, map_token) = guard.token_split();
++        return f(map, None, map_token);
++    }
++
++    let low = core::cmp::min(first_shard, second_shard);
++    let high = core::cmp::max(first_shard, second_shard);
++
++    let mut low_guard = FUTEXES[low].lock(token.token());
++    let (low_map, low_token) = low_guard.token_split();
++    let mut high_guard = unsafe { FUTEXES[high].relock(low_token) };
++    let (high_map, high_token) = high_guard.token_split();
++
++    if first_shard == low {
++        f(low_map, Some(high_map), high_token)
++    } else {
++        f(high_map, Some(low_map), high_token)
++    }
++}
+@@
+-pub fn futex(
+-    addr: usize,
+-    op: usize,
+-    val: usize,
+-    val2: usize,
+-    _addr2: usize,
+-    token: &mut CleanLockToken,
+-) -> Result<usize> {
++pub fn futex(
++    addr: usize,
++    op: usize,
++    val: usize,
++    val2: usize,
++    addr2: usize,
++    val3: usize,
++    token: &mut CleanLockToken,
++) -> Result<usize> {
+@@
+-    {
+-        // TODO: Lock ordering violation
+-        let mut token = unsafe { CleanLockToken::new() };
+-        let mut futexes = FUTEXES[futex_shard(target_physaddr)].lock(token.token());
+-        let (futexes, mut token) = futexes.token_split();
++    loop {
++        let shard = futex_shard(target_physaddr);
++        let queued = {
++            let mut futexes = FUTEXES[shard].lock(token.token());
++            let (futexes, mut futex_token) = futexes.token_split();
++            let addr_space_guard = current_addrsp.acquire_read(futex_token.downgrade());
++            let locked_physaddr = validate_and_translate_virt(&addr_space_guard, target_virtaddr)
++                .ok_or(Error::new(EFAULT))?;
++            if locked_physaddr != target_physaddr {
++                false
++            } else {
++                drop(addr_space_guard);
+@@
+-        futexes
+-            .entry(target_physaddr)
+-            .or_insert_with(Vec::new)
+-            .push(FutexEntry {
+-                target_virtaddr,
+-                context_lock: context_lock.clone(),
+-                addr_space: Arc::downgrade(&current_addrsp),
+-            });
+-    }
++                futexes
++                    .entry(locked_physaddr)
++                    .or_insert_with(Vec::new)
++                    .push(FutexEntry {
++                        target_virtaddr,
++                        context_lock: context_lock.clone(),
++                        addr_space: Arc::downgrade(&current_addrsp),
++                    });
++                true
++            }
++        };
++
++        if queued {
++            break;
++        }
++    }
+@@
+-    drop(addr_space_guard);
+-
+    context::switch(token);
+@@
+    FUTEX_WAKE => {
+@@
+        Ok(woken)
+    }
++    FUTEX_REQUEUE | FUTEX_CMP_REQUEUE => {
++        let _ = validate_futex_u32_addr(addr)?;
++        let target2_virtaddr = validate_futex_u32_addr(addr2)?;
++        let target2_physaddr = {
++            let addr_space_guard = current_addrsp.acquire_read(token.downgrade());
++            validate_and_translate_virt(&addr_space_guard, target2_virtaddr)
++                .ok_or(Error::new(EFAULT))?
++        };
++        let source_shard = futex_shard(target_physaddr);
++        let target_shard = futex_shard(target2_physaddr);
++        let current_addrsp_weak = Arc::downgrade(&current_addrsp);
++
++        let affected = lock_futex_pair(
++            source_shard,
++            target_shard,
++            token,
++            |source_map, target_map_opt, mut futex_token| {
++                let addr_space_guard = current_addrsp.acquire_read(futex_token.downgrade());
++                let locked_source_physaddr = validate_and_translate_virt(&addr_space_guard, target_virtaddr)
++                    .ok_or(Error::new(EFAULT))?;
++                let locked_target_physaddr = validate_and_translate_virt(&addr_space_guard, target2_virtaddr)
++                    .ok_or(Error::new(EFAULT))?;
++                drop(addr_space_guard);
++
++                if locked_source_physaddr != target_physaddr || locked_target_physaddr != target2_physaddr {
++                    return Err(Error::new(EAGAIN));
++                }
++
++                if op == FUTEX_CMP_REQUEUE {
++                    let accessible_addr = crate::memory::RmmA::phys_to_virt(locked_source_physaddr).data();
++                    let current = u64::from(unsafe {
++                        (*(accessible_addr as *const AtomicU32)).load(Ordering::SeqCst)
++                    });
++                    if current != u64::from(val3 as u32) {
++                        return Err(Error::new(EAGAIN));
++                    }
++                }
++
++                let mut source_waiters = source_map.remove(&locked_source_physaddr).unwrap_or_default();
++                let mut total_woken = 0;
++                let mut total_requeued = 0;
++
++                let wake_from = |waiters: &mut Vec<FutexEntry>, limit: usize, token: &mut crate::sync::LockToken<'_, L1>| {
++                    let mut woken = 0;
++                    let mut i = 0;
++                    while i < waiters.len() && woken < limit {
++                        let waiter = match waiters.get(i) {
++                            Some(waiter) => waiter,
++                            None => break,
++                        };
++                        if waiter.target_virtaddr != target_virtaddr || !current_addrsp_weak.ptr_eq(&waiter.addr_space) {
++                            i += 1;
++                            continue;
++                        }
++                        let waiter = waiters.swap_remove(i);
++                        waiter.context_lock.write(token.token()).unblock();
++                        woken += 1;
++                    }
++                    woken
++                };
++
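++                // Wake phase first: release up to `val` waiters on the source
++                // futex, then requeue at most `val2` of the remaining waiters
++                // onto the target futex below.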
total_woken = wake_from(&mut source_waiters, val, &mut futex_token); ++ ++ if let Some(target_map) = target_map_opt { ++ let mut target_waiters = target_map.remove(&locked_target_physaddr).unwrap_or_default(); ++ let mut i = 0; ++ while i < source_waiters.len() && total_requeued < val2 { ++ let should_move = source_waiters ++ .get(i) ++ .map(|waiter| { ++ waiter.target_virtaddr == target_virtaddr ++ && current_addrsp_weak.ptr_eq(&waiter.addr_space) ++ }) ++ .unwrap_or(false); ++ if !should_move { ++ i += 1; ++ continue; ++ } ++ let mut waiter = source_waiters.swap_remove(i); ++ waiter.target_virtaddr = target2_virtaddr; ++ target_waiters.push(waiter); ++ total_requeued += 1; ++ } ++ if !target_waiters.is_empty() { ++ target_map.insert(locked_target_physaddr, target_waiters); ++ } ++ } else if locked_source_physaddr == locked_target_physaddr { ++ for waiter in source_waiters.iter_mut() { ++ if total_requeued >= val2 { ++ break; ++ } ++ if waiter.target_virtaddr == target_virtaddr && current_addrsp_weak.ptr_eq(&waiter.addr_space) { ++ waiter.target_virtaddr = target2_virtaddr; ++ total_requeued += 1; ++ } ++ } ++ } else { ++ let mut target_waiters = source_map.remove(&locked_target_physaddr).unwrap_or_default(); ++ let mut i = 0; ++ while i < source_waiters.len() && total_requeued < val2 { ++ let should_move = source_waiters ++ .get(i) ++ .map(|waiter| { ++ waiter.target_virtaddr == target_virtaddr ++ && current_addrsp_weak.ptr_eq(&waiter.addr_space) ++ }) ++ .unwrap_or(false); ++ if !should_move { ++ i += 1; ++ continue; ++ } ++ let mut waiter = source_waiters.swap_remove(i); ++ waiter.target_virtaddr = target2_virtaddr; ++ target_waiters.push(waiter); ++ total_requeued += 1; ++ } ++ if !target_waiters.is_empty() { ++ source_map.insert(locked_target_physaddr, target_waiters); ++ } ++ } ++ ++ if !source_waiters.is_empty() { ++ source_map.insert(locked_source_physaddr, source_waiters); ++ } ++ ++ Ok(total_woken + total_requeued) ++ }, ++ )?; ++ ++ Ok(affected) ++ } + _ => Err(Error::new(EINVAL)), + } + } +diff --git a/src/syscall/mod.rs b/src/syscall/mod.rs +--- a/src/syscall/mod.rs ++++ b/src/syscall/mod.rs +@@ +- SYS_FUTEX => futex(b, c, d, e, f, token), ++ SYS_FUTEX => futex(b, c, d, e, f, g, token), diff --git a/local/patches/kernel/P8-futex-robust.patch b/local/patches/kernel/P8-futex-robust.patch new file mode 100644 index 00000000..44e8e978 --- /dev/null +++ b/local/patches/kernel/P8-futex-robust.patch @@ -0,0 +1,264 @@ +diff --git a/src/context/context.rs b/src/context/context.rs +--- a/src/context/context.rs ++++ b/src/context/context.rs +@@ + #[allow(dead_code)] + pub futex_pi_waiters: Vec, ++ pub robust_list_head: Option, +@@ + futex_pi_boost: false, + futex_pi_original_prio: DEFAULT_SCHED_OTHER_PRIORITY, + futex_pi_waiters: Vec::new(), ++ robust_list_head: None, + being_sigkilled: false, +diff --git a/src/syscall/debug.rs b/src/syscall/debug.rs +--- a/src/syscall/debug.rs ++++ b/src/syscall/debug.rs +@@ + use crate::{sync::CleanLockToken, syscall::error::Result}; ++ ++const SYS_SET_ROBUST_LIST: usize = 311; ++const SYS_GET_ROBUST_LIST: usize = 312; +@@ + SYS_FUTEX => format!( + "futex({:#X} [{:?}], {}, {}, {}, {}, {})", +@@ + ), ++ SYS_SET_ROBUST_LIST => format!("set_robust_list({:#X}, {})", b, c), ++ SYS_GET_ROBUST_LIST => format!("get_robust_list({}, {:#X}, {:#X})", b, c, d), + SYS_MKNS => format!( +diff --git a/src/syscall/futex.rs b/src/syscall/futex.rs +--- a/src/syscall/futex.rs ++++ b/src/syscall/futex.rs +@@ +-use crate::syscall::{ +- data::TimeSpec, +- error::{Error, 
Result, EAGAIN, EDEADLK, EFAULT, EINVAL, EPERM, ETIMEDOUT},
+-    flag::{FUTEX_REQUEUE, FUTEX_WAIT, FUTEX_WAIT64, FUTEX_WAKE},
+-};
++use crate::syscall::{
++    data::TimeSpec,
++    error::{Error, Result, EAGAIN, EDEADLK, EFAULT, EINVAL, EPERM, ESRCH, ETIMEDOUT},
++    flag::{FUTEX_REQUEUE, FUTEX_WAIT, FUTEX_WAIT64, FUTEX_WAKE},
++};
++
++use super::usercopy::UserSliceWo;
+@@
+ const FUTEX_WAITERS: u32 = 0x8000_0000;
+ const FUTEX_OWNER_DIED: u32 = 0x4000_0000;
+ const FUTEX_TID_MASK: u32 = 0x3FFF_FFFF;
++
++const ROBUST_LIST_LIMIT: usize = 2048;
++const ROBUST_LIST_HEAD_SIZE: usize = size_of::<RobustListHead>();
+@@
+ pub struct FutexEntry {
+@@
+ }
++
++#[derive(Clone, Copy, Debug)]
++#[repr(C)]
++struct RobustList {
++    next: usize,
++}
++
++#[derive(Clone, Copy, Debug)]
++#[repr(C)]
++struct RobustListHead {
++    list: RobustList,
++    futex_offset: isize,
++    list_op_pending: usize,
++}
+@@
++fn lookup_robust_list_head(pid: usize, token: &mut CleanLockToken) -> Result<(usize, usize)> {
++    let current = context::current();
++    {
++        let current_guard = current.read(token.token());
++        if pid == 0 || current_guard.pid == pid {
++            return Ok((current_guard.robust_list_head.unwrap_or(0), ROBUST_LIST_HEAD_SIZE));
++        }
++    }
++
++    let mut token_ref = token.token();
++    let mut contexts = context::contexts(token_ref.downgrade());
++    let (contexts, mut contexts_token) = contexts.token_split();
++    for context_ref in contexts.iter() {
++        let context = context_ref.read(contexts_token.token());
++        if context.pid == pid {
++            return Ok((context.robust_list_head.unwrap_or(0), ROBUST_LIST_HEAD_SIZE));
++        }
++    }
++
++    Err(Error::new(ESRCH))
++}
++
++fn walk_robust_list_node(
++    node_ptr: usize,
++    futex_offset: isize,
++    owner_tid: u32,
++    token: &mut CleanLockToken,
++) {
++    if node_ptr == 0 {
++        return;
++    }
++
++    let Some(futex_addr) = node_ptr.checked_add_signed(futex_offset) else {
++        return;
++    };
++    let Ok(target_virtaddr) = validate_futex_u32_addr(futex_addr) else {
++        return;
++    };
++
++    let current_addrsp = match AddrSpace::current() {
++        Ok(addrsp) => addrsp,
++        Err(_) => return,
++    };
++
++    let addr_space_guard = current_addrsp.acquire_read(token.downgrade());
++    let Some(initial_physaddr) = validate_and_translate_virt(&addr_space_guard, target_virtaddr) else {
++        return;
++    };
++    drop(addr_space_guard);
++    let shard = futex_shard(initial_physaddr);
++
++    let mut futexes = FUTEXES[shard].lock(token.token());
++    let (futexes, mut futex_token) = futexes.token_split();
++    let addr_space_guard = current_addrsp.acquire_read(futex_token.downgrade());
++    let Some(locked_physaddr) = validate_and_translate_virt(&addr_space_guard, target_virtaddr) else {
++        return;
++    };
++    drop(addr_space_guard);
++
++    let futex_atomic = futex_atomic_u32(locked_physaddr);
++    let current = futex_atomic.load(Ordering::SeqCst);
++    if (current & FUTEX_TID_MASK) != owner_tid {
++        return;
++    }
++
++    let mut new = (current & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
++    if let Some(queue) = futexes.get_mut(&locked_physaddr) {
++        queue.pi_owner = None;
++        let mut woke = false;
++        let mut i = 0;
++        while i < queue.waiters.len() && !woke {
++            let waiter = match queue.waiters.get(i) {
++                Some(waiter) => waiter,
++                None => break,
++            };
++            if waiter.target_virtaddr != target_virtaddr || !Arc::downgrade(&current_addrsp).ptr_eq(&waiter.addr_space) {
++                i += 1;
++                continue;
++            }
++            let waiter = queue.waiters.swap_remove(i);
++            waiter.context_lock.write(futex_token.token()).unblock();
++            woke = true;
++        }
++        if !queue.waiters.is_empty() {
++            new |= FUTEX_WAITERS;
++        }
++    }
++
++    futex_atomic.store(new, Ordering::SeqCst);
++}
++
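++/// Called on context exit: walks the dying thread's user-space robust list,
++/// marks each futex it still owns with FUTEX_OWNER_DIED, and wakes one
++/// waiter per futex so ownership can be recovered. The walk is capped at
++/// ROBUST_LIST_LIMIT entries to defend against cyclic user lists.
++pub fn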
cleanup_current_robust_futexes(token: &mut CleanLockToken) {
++    let context_lock = context::current();
++    let (head_ptr, owner_tid) = {
++        let context = context_lock.read(token.token());
++        let Some(head_ptr) = context.robust_list_head else {
++            return;
++        };
++        (head_ptr, context_futex_tid(&context))
++    };
++
++    let Ok(head) = UserSlice::ro(head_ptr, ROBUST_LIST_HEAD_SIZE)
++        .and_then(|slice| unsafe { slice.read_exact::<RobustListHead>() })
++    else {
++        return;
++    };
++
++    let mut next = head.list.next;
++    let mut walked = 0;
++    while next != 0 && next != head_ptr && walked < ROBUST_LIST_LIMIT {
++        let node_ptr = next;
++        let Ok(node) = UserSlice::ro(node_ptr, size_of::<RobustList>())
++            .and_then(|slice| unsafe { slice.read_exact::<RobustList>() })
++        else {
++            break;
++        };
++        walk_robust_list_node(node_ptr, head.futex_offset, owner_tid, token);
++        next = node.next;
++        walked += 1;
++    }
++
++    if head.list_op_pending != 0 {
++        walk_robust_list_node(head.list_op_pending, head.futex_offset, owner_tid, token);
++    }
++}
++
++pub fn set_robust_list(head: usize, len: usize, token: &mut CleanLockToken) -> Result<()> {
++    if len != ROBUST_LIST_HEAD_SIZE {
++        return Err(Error::new(EINVAL));
++    }
++    if head != 0 {
++        UserSlice::ro(head, ROBUST_LIST_HEAD_SIZE)?;
++    }
++
++    let current = context::current();
++    current.write(token.token()).robust_list_head = (head != 0).then_some(head);
++    Ok(())
++}
++
++pub fn get_robust_list(pid: usize, head_ptr: usize, len_ptr: usize, token: &mut CleanLockToken) -> Result<()> {
++    let (head, len) = lookup_robust_list_head(pid, token)?;
++    UserSliceWo::wo(head_ptr, size_of::<usize>())?.write_usize(head)?;
++    UserSliceWo::wo(len_ptr, size_of::<usize>())?.write_usize(len)?;
++    Ok(())
++}
+diff --git a/src/syscall/mod.rs b/src/syscall/mod.rs
+--- a/src/syscall/mod.rs
++++ b/src/syscall/mod.rs
+@@
+-pub use self::{
+-    fs::*,
+-    futex::futex,
+-    process::*,
+-    time::*,
+-    usercopy::validate_region,
+-};
++pub use self::{
++    fs::*,
++    futex::{futex, get_robust_list, set_robust_list},
++    process::*,
++    time::*,
++    usercopy::validate_region,
++};
+@@
++const SYS_SET_ROBUST_LIST: usize = 311;
++const SYS_GET_ROBUST_LIST: usize = 312;
+@@
+        SYS_CLOCK_GETTIME => {
+            clock_gettime(b, UserSlice::wo(c, size_of::<TimeSpec>())?, token).map(|()| 0)
+        }
+        SYS_FUTEX => futex(b, c, d, e, f, g, token),
++        SYS_SET_ROBUST_LIST => set_robust_list(b, c, token).map(|()| 0),
++        SYS_GET_ROBUST_LIST => get_robust_list(b, c, d, token).map(|()| 0),
+
+        SYS_MPROTECT => mprotect(b, c, MapFlags::from_bits_truncate(d), token).map(|()| 0),
+diff --git a/src/syscall/process.rs b/src/syscall/process.rs
+--- a/src/syscall/process.rs
++++ b/src/syscall/process.rs
+@@
+ pub fn exit_this_context(excp: Option, token: &mut CleanLockToken) -> !
{ + let mut close_files; + let addrspace_opt; + ++ super::futex::cleanup_current_robust_futexes(token); ++ + let context_lock = context::current(); + { + let mut context = context_lock.write(token.token()); +@@ + addrspace_opt = context + .set_addr_space(None, token.downgrade()) + .and_then(|a| Arc::try_unwrap(a).ok()); ++ context.robust_list_head = None; + drop(mem::replace(&mut context.syscall_head, SyscallFrame::Dummy)); + drop(mem::replace(&mut context.syscall_tail, SyscallFrame::Dummy)); diff --git a/local/patches/kernel/P8-initial-placement.patch b/local/patches/kernel/P8-initial-placement.patch new file mode 100644 index 00000000..e7cb5b6d --- /dev/null +++ b/local/patches/kernel/P8-initial-placement.patch @@ -0,0 +1,56 @@ +diff --git a/src/context/mod.rs b/src/context/mod.rs +--- a/src/context/mod.rs ++++ b/src/context/mod.rs +@@ -10,9 +10,9 @@ use core::{num::NonZeroUsize, ops::Deref}; + + use crate::{ + context::memory::AddrSpaceWrapper, +- cpu_set::LogicalCpuSet, ++ cpu_set::{LogicalCpuId, LogicalCpuSet}, + memory::{RmmA, RmmArch, TableKind}, +- percpu::PercpuBlock, ++ percpu::{get_percpu_block, PercpuBlock}, + sync::{ + ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard, + RwLockWriteGuard, L0, L1, L2, L4, +@@ -118,6 +118,30 @@ pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextDa + RUN_CONTEXTS.lock(token) + } + ++fn least_loaded_cpu() -> LogicalCpuId { ++ let current_cpu = crate::cpu_id(); ++ let mut best_cpu = current_cpu; ++ let mut best_depth = usize::MAX; ++ ++ for raw_id in 0..crate::cpu_count() { ++ let cpu_id = LogicalCpuId::new(raw_id); ++ let Some(percpu) = get_percpu_block(cpu_id) else { ++ continue; ++ }; ++ ++ percpu.sched.take_lock(); ++ let depth = unsafe { percpu.sched.queues().iter().map(|queue| queue.len()).sum() }; ++ percpu.sched.release_lock(); ++ ++ if depth < best_depth { ++ best_depth = depth; ++ best_cpu = cpu_id; ++ } ++ } ++ ++ best_cpu ++} ++ + pub fn init(token: &mut CleanLockToken) { + let owner = None; // kmain not owned by any fd + let mut context = Context::new(owner).expect("failed to create kmain context"); +@@ -238,6 +262,9 @@ pub fn spawn( + + context.kstack = Some(stack); + context.userspace = userspace_allowed; ++ let target_cpu = least_loaded_cpu(); ++ context.sched_affinity = LogicalCpuSet::empty(); ++ context.sched_affinity.atomic_set(target_cpu); + + let context_lock = Arc::new(ContextLock::new(context)); + let context_ref = ContextRef(Arc::clone(&context_lock)); diff --git a/local/patches/kernel/P8-load-balance.patch b/local/patches/kernel/P8-load-balance.patch new file mode 100644 index 00000000..8883c992 --- /dev/null +++ b/local/patches/kernel/P8-load-balance.patch @@ -0,0 +1,146 @@ +diff --git a/src/percpu.rs b/src/percpu.rs +--- a/src/percpu.rs ++++ b/src/percpu.rs +@@ -29,12 +29,14 @@ pub struct PerCpuSched { + pub run_queues_lock: AtomicBool, + pub balance: Cell<[usize; RUN_QUEUE_COUNT]>, + pub last_queue: Cell, ++ pub last_balance_time: Cell, + } + + impl PerCpuSched { + pub const fn new() -> Self { + const EMPTY: VecDeque = VecDeque::new(); + Self { + run_queues: SyncUnsafeCell::new([EMPTY; RUN_QUEUE_COUNT]), + run_queues_lock: AtomicBool::new(false), + balance: Cell::new([0; RUN_QUEUE_COUNT]), + last_queue: Cell::new(0), ++ last_balance_time: Cell::new(0), + } + } +diff --git a/src/context/switch.rs b/src/context/switch.rs +--- a/src/context/switch.rs ++++ b/src/context/switch.rs +@@ -33,6 +33,8 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ + 70, 56, 45, 
36, 29, 23, 18, 15, + ]; + ++const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000; ++ + static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0); +@@ -101,6 +103,9 @@ pub fn tick(token: &mut CleanLockToken) { + let new_ticks = ticks_cell.get() + 1; + ticks_cell.set(new_ticks); + ++ let balance_time = crate::time::monotonic(token); ++ maybe_balance_queues(token, percpu, balance_time); ++ + // Trigger a context switch after every 3 ticks. + if new_ticks >= 3 { + switch(token); +@@ -427,6 +432,92 @@ fn steal_work( + + None + } ++ ++fn queue_depth(percpu: &PercpuBlock) -> usize { ++ let mut sched_lock = SchedQueuesLock::new(&percpu.sched); ++ unsafe { ++ sched_lock ++ .queues_mut() ++ .iter() ++ .map(|queue| queue.len()) ++ .sum() ++ } ++} ++ ++fn migrate_one_context( ++ token: &mut CleanLockToken, ++ source_id: LogicalCpuId, ++ target_id: LogicalCpuId, ++ switch_time: u128, ++) -> bool { ++ let Some(source) = get_percpu_block(source_id) else { ++ return false; ++ }; ++ let Some(target) = get_percpu_block(target_id) else { ++ return false; ++ }; ++ ++ let source_idle = source.switch_internals.idle_context(); ++ let moved = { ++ let mut source_lock = SchedQueuesLock::new(&source.sched); ++ let source_queues = unsafe { source_lock.queues_mut() }; ++ pop_movable_context(token, source_queues, target_id, switch_time, &source_idle) ++ }; ++ ++ let Some((prio, context_ref)) = moved else { ++ return false; ++ }; ++ ++ let mut target_lock = SchedQueuesLock::new(&target.sched); ++ unsafe { ++ target_lock.queues_mut()[prio].push_back(context_ref); ++ } ++ true ++} ++ ++fn maybe_balance_queues(token: &mut CleanLockToken, percpu: &PercpuBlock, balance_time: u128) { ++ if crate::cpu_count() <= 1 || percpu.cpu_id != LogicalCpuId::BSP { ++ return; ++ } ++ if balance_time.saturating_sub(percpu.sched.last_balance_time.get()) < LOAD_BALANCE_INTERVAL_NS ++ { ++ return; ++ } ++ ++ percpu.sched.last_balance_time.set(balance_time); ++ ++ let mut depths = Vec::new(); ++ let mut total_depth = 0usize; ++ for raw_id in 0..crate::cpu_count() { ++ let cpu_id = LogicalCpuId::new(raw_id); ++ let Some(cpu_percpu) = get_percpu_block(cpu_id) else { ++ continue; ++ }; ++ let depth = queue_depth(cpu_percpu); ++ total_depth += depth; ++ depths.push((cpu_id, depth)); ++ } ++ ++ if depths.len() <= 1 || total_depth == 0 { ++ return; ++ } ++ ++ let avg_depth = (total_depth + depths.len().saturating_sub(1)) / depths.len(); ++ ++ for target_index in 0..depths.len() { ++ if depths[target_index].1 != 0 { ++ continue; ++ } ++ ++ let mut source_index = None; ++ let mut source_depth = 0usize; ++ for (idx, &(_, depth)) in depths.iter().enumerate() { ++ if idx == target_index { ++ continue; ++ } ++ if depth > avg_depth + 1 && depth > source_depth { ++ source_index = Some(idx); ++ source_depth = depth; ++ } ++ } ++ ++ let Some(source_index) = source_index else { ++ continue; ++ }; ++ ++ let source_id = depths[source_index].0; ++ let target_id = depths[target_index].0; ++ if migrate_one_context(token, source_id, target_id, balance_time) { ++ depths[source_index].1 = depths[source_index].1.saturating_sub(1); ++ depths[target_index].1 += 1; ++ } ++ } ++} diff --git a/local/patches/kernel/P8-percpu-sched.patch b/local/patches/kernel/P8-percpu-sched.patch new file mode 100644 index 00000000..0628db13 --- /dev/null +++ b/local/patches/kernel/P8-percpu-sched.patch @@ -0,0 +1,123 @@ +diff --git a/src/percpu.rs b/src/percpu.rs +index f4ad5e6..da10036 100644 +--- a/src/percpu.rs ++++ b/src/percpu.rs +@@ -1,9 +1,10 @@ + use alloc::{ ++ 
collections::VecDeque, + sync::{Arc, Weak}, + vec::Vec, + }; + use core::{ +- cell::{Cell, RefCell}, ++ cell::{Cell, RefCell, SyncUnsafeCell}, + sync::atomic::{AtomicBool, AtomicPtr, Ordering}, + }; + +@@ -12,7 +13,10 @@ use syscall::PtraceFlags; + + use crate::{ + arch::device::ArchPercpuMisc, +- context::{empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu}, ++ context::{ ++ empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu, WeakContextRef, ++ RUN_QUEUE_COUNT, ++ }, + cpu_set::{LogicalCpuId, MAX_CPU_COUNT}, + cpu_stats::{CpuStats, CpuStatsData}, + ptrace::Session, +@@ -20,6 +24,58 @@ use crate::{ + syscall::debug::SyscallDebugInfo, + }; + ++#[allow(dead_code)] ++pub struct PerCpuSched { ++ pub run_queues: SyncUnsafeCell<[VecDeque; RUN_QUEUE_COUNT]>, ++ pub run_queues_lock: AtomicBool, ++ pub balance: Cell<[usize; RUN_QUEUE_COUNT]>, ++ pub last_queue: Cell, ++ pub last_balance_time: Cell, ++} ++ ++impl PerCpuSched { ++ pub const fn new() -> Self { ++ const EMPTY: VecDeque = VecDeque::new(); ++ Self { ++ run_queues: SyncUnsafeCell::new([EMPTY; RUN_QUEUE_COUNT]), ++ run_queues_lock: AtomicBool::new(false), ++ balance: Cell::new([0; RUN_QUEUE_COUNT]), ++ last_queue: Cell::new(0), ++ last_balance_time: Cell::new(0), ++ } ++ } ++ ++ pub fn take_lock(&self) { ++ while self ++ .run_queues_lock ++ .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) ++ .is_err() ++ { ++ while self.run_queues_lock.load(Ordering::Relaxed) { ++ core::hint::spin_loop(); ++ } ++ } ++ } ++ ++ pub fn release_lock(&self) { ++ self.run_queues_lock.store(false, Ordering::Release); ++ } ++ ++ /// # Safety ++ /// ++ /// The caller must hold `run_queues_lock` while accessing the returned reference. ++ pub unsafe fn queues(&self) -> &[VecDeque; RUN_QUEUE_COUNT] { ++ unsafe { &*self.run_queues.get() } ++ } ++ ++ /// # Safety ++ /// ++ /// The caller must hold `run_queues_lock` while accessing the returned reference. ++ pub unsafe fn queues_mut(&self) -> &mut [VecDeque; RUN_QUEUE_COUNT] { ++ unsafe { &mut *self.run_queues.get() } ++ } ++} ++ + /// The percpu block, that stored all percpu variables. + pub struct PercpuBlock { + /// A unique immutable number that identifies the current CPU - used for scheduling +@@ -31,8 +87,8 @@ pub struct PercpuBlock { + pub current_addrsp: RefCell>>, + pub new_addrsp_tmp: Cell>>, + pub wants_tlb_shootdown: AtomicBool, +- pub balance: Cell<[usize; 40]>, +- pub last_queue: Cell, ++ ++ pub sched: PerCpuSched, + + // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it + // first to avoid cache invalidation. 
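++    // Scheduling state formerly stored directly on PercpuBlock (`balance`,
++    // `last_queue`) now lives behind `PerCpuSched` and its spinlock, so
++    // remote CPUs can safely inspect these queues for stealing and balancing.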
+@@ -57,6 +113,14 @@ pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) { + ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release) + } + ++pub fn get_percpu_block(id: LogicalCpuId) -> Option<&'static PercpuBlock> { ++ unsafe { ++ ALL_PERCPU_BLOCKS[id.get() as usize] ++ .load(Ordering::Acquire) ++ .as_ref() ++ } ++} ++ + pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> { + let mut res = ALL_PERCPU_BLOCKS + .iter() +@@ -187,8 +251,7 @@ impl PercpuBlock { + current_addrsp: RefCell::new(None), + new_addrsp_tmp: Cell::new(None), + wants_tlb_shootdown: AtomicBool::new(false), +- balance: Cell::new([0; 40]), +- last_queue: Cell::new(39), ++ sched: PerCpuSched::new(), + ptrace_flags: Cell::new(PtraceFlags::empty()), + ptrace_session: RefCell::new(None), + inside_syscall: Cell::new(false), diff --git a/local/patches/kernel/P8-percpu-wiring.patch b/local/patches/kernel/P8-percpu-wiring.patch new file mode 100644 index 00000000..9a952f41 --- /dev/null +++ b/local/patches/kernel/P8-percpu-wiring.patch @@ -0,0 +1,985 @@ +diff --git a/src/context/switch.rs b/src/context/switch.rs +index 86684c8..d054734 100644 +--- a/src/context/switch.rs ++++ b/src/context/switch.rs +@@ -5,18 +5,18 @@ + use crate::{ + context::{ + self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard, +- Context, ContextLock, WeakContextRef, ++ Context, ContextLock, SchedPolicy, WeakContextRef, RUN_QUEUE_COUNT, + }, +- cpu_set::LogicalCpuId, ++ cpu_set::{LogicalCpuId, LogicalCpuSet}, + cpu_stats::{self, CpuState}, +- percpu::PercpuBlock, +- sync::{ArcRwLockWriteGuard, CleanLockToken, L4}, ++ percpu::{get_percpu_block, PerCpuSched, PercpuBlock}, ++ sync::{ArcRwLockWriteGuard, CleanLockToken, LockToken, L1, L4}, + }; + use alloc::{sync::Arc, vec::Vec}; + use core::{ + cell::{Cell, RefCell}, + hint, mem, +- sync::atomic::Ordering, ++ sync::atomic::{AtomicUsize, Ordering}, + }; + use syscall::PtraceFlags; + +@@ -33,35 +33,49 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ + 70, 56, 45, 36, 29, 23, 18, 15, + ]; + +-/// Determines if a given context is eligible to be scheduled on a given CPU (in +-/// principle, the current CPU). +-/// +-/// # Safety +-/// This function is unsafe because it modifies the `context`'s state directly without synchronization. +-/// +-/// # Parameters +-/// - `context`: The context (process/thread) to be checked. +-/// - `cpu_id`: The logical ID of the CPU on which the context is being scheduled. +-/// +-/// # Returns +-/// - `UpdateResult::CanSwitch`: If the context can be switched to. +-/// - `UpdateResult::Skip`: If the context should be skipped (e.g., it's running on another CPU). 
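++/// Minimum spacing between balancing passes. At the ~2.25 ms scheduler
++/// tick (three ticks per forced switch), 100 ms keeps the BSP's
++/// rebalancing overhead negligible.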
++const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000; ++ ++static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0); ++ ++struct SchedQueuesLock<'a> { ++ sched: &'a PerCpuSched, ++} ++ ++impl<'a> SchedQueuesLock<'a> { ++ fn new(sched: &'a PerCpuSched) -> Self { ++ sched.take_lock(); ++ Self { sched } ++ } ++ ++ unsafe fn queues_mut( ++ &mut self, ++ ) -> &mut [alloc::collections::VecDeque; RUN_QUEUE_COUNT] { ++ unsafe { self.sched.queues_mut() } ++ } ++} ++ ++impl Drop for SchedQueuesLock<'_> { ++ fn drop(&mut self) { ++ self.sched.release_lock(); ++ } ++} ++ ++fn assign_context_to_cpu(context: &mut Context, cpu_id: LogicalCpuId) { ++ context.sched_affinity = LogicalCpuSet::empty(); ++ context.sched_affinity.atomic_set(cpu_id); ++} ++ + unsafe fn update_runnable( + context: &mut Context, + cpu_id: LogicalCpuId, + switch_time: u128, + ) -> UpdateResult { +- // Ignore contexts that are already running. + if context.running { + return UpdateResult::Skip; + } +- +- // Ignore contexts assigned to other CPUs. + if !context.sched_affinity.contains(cpu_id) { + return UpdateResult::Skip; + } +- +- // If context is soft-blocked and has a wake-up time, check if it should wake up. + if context.status.is_soft_blocked() + && let Some(wake) = context.wake + && switch_time >= wake +@@ -69,8 +83,6 @@ unsafe fn update_runnable( + context.wake = None; + context.unblock_no_ipi(); + } +- +- // If the context is runnable, indicate it can be switched to. + if context.status.is_runnable() { + UpdateResult::CanSwitch + } else { +@@ -90,12 +102,16 @@ struct SwitchResultInner { + /// + /// The function also calls the signal handler after switching contexts. + pub fn tick(token: &mut CleanLockToken) { +- let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks; ++ let percpu = PercpuBlock::current(); ++ let ticks_cell = &percpu.switch_internals.pit_ticks; + + let new_ticks = ticks_cell.get() + 1; + ticks_cell.set(new_ticks); + +- // Trigger a context switch after every 3 ticks (approx. 6.75 ms). ++ let balance_time = crate::time::monotonic(token); ++ maybe_balance_queues(token, percpu, balance_time); ++ ++ // Trigger a context switch after every 3 ticks. 
+ if new_ticks >= 3 { + switch(token); + crate::context::signal::signal_handler(token); +@@ -167,22 +183,12 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + let mut prev_context_guard = unsafe { prev_context_lock.write_arc() }; + + if !prev_context_guard.is_preemptable() { +- // Unset global lock + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); +- +- // Pretend to have finished switching, so CPU is not idled + return SwitchResult::Switched; + } + + // Alarm (previously in update_runnable) +- let wakeups = wakeup_contexts(token, switch_time); +- +- if wakeups.len() > 0 { +- let mut run_contexts = run_contexts(token.token()); +- for (prio, context_lock) in wakeups { +- run_contexts.set[prio].push_back(context_lock); +- } +- } ++ wakeup_contexts(token, percpu, switch_time); + + let cpu_id = crate::cpu_id(); + +@@ -213,6 +219,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + + // Set the previous context as "not running" + prev_context.running = false; ++ prev_context.last_cpu = prev_context.cpu_id; + + // Set the next context as "running" + next_context.running = true; +@@ -222,6 +229,14 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + // Update times + if !was_idle { + prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time); ++ if prev_context.sched_policy == SchedPolicy::Other { ++ let actual_ns = switch_time.saturating_sub(prev_context.switch_time); ++ let weight = ++ SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128; ++ let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128; ++ let delta = actual_ns.saturating_mul(default_weight) / weight.max(1); ++ prev_context.vruntime = prev_context.vruntime.saturating_add(delta); ++ } + } + next_context.switch_time = switch_time; + if next_context.userspace { +@@ -302,13 +317,234 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + } + } + +-fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize, WeakContextRef)> { ++fn queue_previous_context( ++ token: &mut CleanLockToken, ++ percpu: &PercpuBlock, ++ prev_context_lock: &Arc, ++ prev_context_guard: &ArcRwLockWriteGuard, ++ idle_context: &Arc, ++) { ++ if Arc::ptr_eq(prev_context_lock, idle_context) { ++ return; ++ } ++ ++ let prev_ctx = WeakContextRef(Arc::downgrade(prev_context_lock)); ++ if prev_context_guard.status.is_runnable() { ++ let prio = prev_context_guard.prio; ++ let mut sched_lock = SchedQueuesLock::new(&percpu.sched); ++ unsafe { ++ sched_lock.queues_mut()[prio].push_back(prev_ctx); ++ } ++ } else { ++ idle_contexts(token.downgrade()).push_back(prev_ctx); ++ } ++} ++ ++fn pop_movable_context( ++ token: &mut CleanLockToken, ++ queues: &mut [alloc::collections::VecDeque; RUN_QUEUE_COUNT], ++ target_cpu: LogicalCpuId, ++ switch_time: u128, ++ idle_context: &Arc, ++) -> Option<(usize, WeakContextRef)> { ++ for prio in 0..RUN_QUEUE_COUNT { ++ let len = queues[prio].len(); ++ for _ in 0..len { ++ let Some(context_ref) = queues[prio].pop_front() else { ++ break; ++ }; ++ let Some(context_lock) = context_ref.upgrade() else { ++ continue; ++ }; ++ if Arc::ptr_eq(&context_lock, idle_context) { ++ queues[prio].push_back(context_ref); ++ continue; ++ } ++ ++ let mut context_guard = unsafe { context_lock.write_arc() }; ++ let sw = unsafe { update_stealable(&mut context_guard, switch_time) }; ++ if let UpdateResult::CanSwitch = sw { ++ assign_context_to_cpu(&mut context_guard, target_cpu); ++ let moved_ref = 
WeakContextRef(Arc::downgrade(ArcContextLockWriteGuard::rwlock( ++ &context_guard, ++ ))); ++ drop(context_guard); ++ return Some((prio, moved_ref)); ++ } ++ ++ if matches!(sw, UpdateResult::Blocked) { ++ idle_contexts(token.downgrade()).push_back(context_ref); ++ } else { ++ queues[prio].push_back(context_ref); ++ } ++ } ++ } ++ ++ None ++} ++ ++fn steal_work( ++ token: &mut CleanLockToken, ++ cpu_id: LogicalCpuId, ++ switch_time: u128, ++) -> Option { ++ let cpu_count = crate::cpu_count(); ++ if cpu_count <= 1 { ++ return None; ++ } ++ ++ for offset in 1..cpu_count { ++ let victim_id = LogicalCpuId::new((cpu_id.get() + offset) % cpu_count); ++ let Some(victim) = get_percpu_block(victim_id) else { ++ continue; ++ }; ++ ++ let victim_idle = victim.switch_internals.idle_context(); ++ let mut victim_lock = SchedQueuesLock::new(&victim.sched); ++ let victim_queues = unsafe { victim_lock.queues_mut() }; ++ ++ for prio in 0..RUN_QUEUE_COUNT { ++ let len = victim_queues[prio].len(); ++ for _ in 0..len { ++ let Some(context_ref) = victim_queues[prio].pop_front() else { ++ break; ++ }; ++ let Some(context_lock) = context_ref.upgrade() else { ++ continue; ++ }; ++ if Arc::ptr_eq(&context_lock, &victim_idle) { ++ victim_queues[prio].push_back(context_ref); ++ continue; ++ } ++ ++ let mut context_guard = unsafe { context_lock.write_arc() }; ++ let sw = unsafe { update_stealable(&mut context_guard, switch_time) }; ++ if let UpdateResult::CanSwitch = sw { ++ assign_context_to_cpu(&mut context_guard, cpu_id); ++ SCHED_STEAL_COUNT.fetch_add(1, Ordering::Relaxed); ++ return Some(context_guard); ++ } ++ ++ if matches!(sw, UpdateResult::Blocked) { ++ idle_contexts(token.downgrade()).push_back(context_ref); ++ } else { ++ victim_queues[prio].push_back(context_ref); ++ } ++ } ++ } ++ } ++ ++ None ++} ++ ++fn queue_depth(percpu: &PercpuBlock) -> usize { ++ let mut sched_lock = SchedQueuesLock::new(&percpu.sched); ++ unsafe { ++ sched_lock ++ .queues_mut() ++ .iter() ++ .map(|queue| queue.len()) ++ .sum() ++ } ++} ++ ++fn migrate_one_context( ++ token: &mut CleanLockToken, ++ source_id: LogicalCpuId, ++ target_id: LogicalCpuId, ++ switch_time: u128, ++) -> bool { ++ let Some(source) = get_percpu_block(source_id) else { ++ return false; ++ }; ++ let Some(target) = get_percpu_block(target_id) else { ++ return false; ++ }; ++ ++ let source_idle = source.switch_internals.idle_context(); ++ let moved = { ++ let mut source_lock = SchedQueuesLock::new(&source.sched); ++ let source_queues = unsafe { source_lock.queues_mut() }; ++ pop_movable_context(token, source_queues, target_id, switch_time, &source_idle) ++ }; ++ ++ let Some((prio, context_ref)) = moved else { ++ return false; ++ }; ++ ++ let mut target_lock = SchedQueuesLock::new(&target.sched); ++ unsafe { ++ target_lock.queues_mut()[prio].push_back(context_ref); ++ } ++ true ++} ++ ++fn maybe_balance_queues(token: &mut CleanLockToken, percpu: &PercpuBlock, balance_time: u128) { ++ if crate::cpu_count() <= 1 || percpu.cpu_id != LogicalCpuId::BSP { ++ return; ++ } ++ if balance_time.saturating_sub(percpu.sched.last_balance_time.get()) < LOAD_BALANCE_INTERVAL_NS ++ { ++ return; ++ } ++ ++ percpu.sched.last_balance_time.set(balance_time); ++ ++ let mut depths = Vec::new(); ++ let mut total_depth = 0usize; ++ for raw_id in 0..crate::cpu_count() { ++ let cpu_id = LogicalCpuId::new(raw_id); ++ let Some(cpu_percpu) = get_percpu_block(cpu_id) else { ++ continue; ++ }; ++ let depth = queue_depth(cpu_percpu); ++ total_depth += depth; ++ depths.push((cpu_id, depth)); ++ } 
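++    // `depths` now holds (cpu, run-queue depth) for every online CPU; bail
++    // out when only one CPU reported or there is no queued work anywhere.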
++ ++ if depths.len() <= 1 || total_depth == 0 { ++ return; ++ } ++ ++ let avg_depth = (total_depth + depths.len().saturating_sub(1)) / depths.len(); ++ ++ for target_index in 0..depths.len() { ++ if depths[target_index].1 != 0 { ++ continue; ++ } ++ ++ let mut source_index = None; ++ let mut source_depth = 0usize; ++ for (idx, &(_, depth)) in depths.iter().enumerate() { ++ if idx == target_index { ++ continue; ++ } ++ if depth > avg_depth + 1 && depth > source_depth { ++ source_index = Some(idx); ++ source_depth = depth; ++ } ++ } ++ ++ let Some(source_index) = source_index else { ++ continue; ++ }; ++ ++ let source_id = depths[source_index].0; ++ let target_id = depths[target_index].0; ++ if migrate_one_context(token, source_id, target_id, balance_time) { ++ depths[source_index].1 = depths[source_index].1.saturating_sub(1); ++ depths[target_index].1 += 1; ++ } ++ } ++} ++ ++fn wakeup_contexts(token: &mut CleanLockToken, percpu: &PercpuBlock, switch_time: u128) { + // TODO: Optimise this somehow. Perhaps using a separate timer queue? + let mut wakeups = Vec::new(); + let current_context = context::current(); + let Some(idle_contexts) = idle_contexts_try(token.downgrade()) else { + // other cpus may spawning or killing contexts so let's skip wakeups to avoid contention +- return wakeups; ++ return; + }; + let (mut idle_contexts, mut token) = idle_contexts.into_split(); + let len = idle_contexts.len(); +@@ -327,15 +563,14 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize, + idle_contexts.push_back(context_ref); + continue; + }; +- if guard.status.is_soft_blocked() { +- if let Some(wake) = guard.wake { +- if switch_time >= wake { +- let prio = guard.prio; +- drop(guard); +- wakeups.push((prio, context_ref)); +- continue; +- } +- } ++ if guard.status.is_soft_blocked() ++ && let Some(wake) = guard.wake ++ && switch_time >= wake ++ { ++ let prio = guard.prio; ++ drop(guard); ++ wakeups.push((prio, context_ref)); ++ continue; + } + + if guard.status.is_runnable() && !guard.running { +@@ -348,43 +583,127 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize, + drop(guard); + idle_contexts.push_back(context_ref); + } +- wakeups ++ ++ if wakeups.is_empty() { ++ return; ++ } ++ ++ let mut sched_lock = SchedQueuesLock::new(&percpu.sched); ++ let run_queues = unsafe { sched_lock.queues_mut() }; ++ for (prio, context_ref) in wakeups { ++ if let Some(context_lock) = context_ref.upgrade() { ++ let mut context_guard = unsafe { context_lock.write_arc() }; ++ assign_context_to_cpu(&mut context_guard, percpu.cpu_id); ++ } ++ run_queues[prio].push_back(context_ref); ++ } + } + +-/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler +-fn select_next_context( ++fn pick_next_from_queues( + token: &mut CleanLockToken, +- percpu: &PercpuBlock, ++ contexts_list: &mut [alloc::collections::VecDeque; RUN_QUEUE_COUNT], + cpu_id: LogicalCpuId, + switch_time: u128, +- was_idle: bool, +- prev_context_guard: &mut ArcRwLockWriteGuard, +-) -> Result, SwitchResult> { +- let contexts_data = run_contexts(token.token()); +- let (mut contexts_data, mut token) = contexts_data.into_split(); +- let contexts_list = &mut contexts_data.set; +- let idle_context = percpu.switch_internals.idle_context(); +- let mut balance = percpu.balance.get(); +- let mut i = percpu.last_queue.get() % 40; +- +- // Lock the previous context. 
+- let prev_context_lock = crate::context::current(); +- ++ prev_context_lock: &Arc, ++ idle_context: &Arc, ++ balance: &mut [usize; RUN_QUEUE_COUNT], ++ i: &mut usize, ++) -> Option { + let mut empty_queues = 0; + let mut total_iters = 0; +- let mut next_context_guard_opt = None; +- + let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); + let mut skipped_contexts = 0; + ++ for prio in 0..RUN_QUEUE_COUNT { ++ let rt_contexts = contexts_list ++ .get_mut(prio) ++ .expect("prio should be between [0, 39]"); ++ let len = rt_contexts.len(); ++ for _ in 0..len { ++ let (rt_ref, rt_lock) = match rt_contexts.pop_front() { ++ Some(lock) => match lock.upgrade() { ++ Some(l) => (lock, l), ++ None => { ++ skipped_contexts += 1; ++ continue; ++ } ++ }, ++ None => break, ++ }; ++ if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ let rt_guard = unsafe { rt_lock.write_arc() }; ++ if !rt_guard.status.is_runnable() ++ || rt_guard.running ++ || !rt_guard.sched_affinity.contains(cpu_id) ++ { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ if rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin ++ { ++ return Some(rt_guard); ++ } ++ rt_contexts.push_back(rt_ref); ++ } ++ } ++ ++ { ++ let mut min_vruntime = u128::MAX; ++ let mut best: Option<(usize, WeakContextRef)> = None; ++ for (prio, queue) in contexts_list.iter().enumerate() { ++ for ctx_ref in queue.iter() { ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ if Arc::ptr_eq(&ctx_lock, prev_context_lock) ++ || Arc::ptr_eq(&ctx_lock, idle_context) ++ { ++ continue; ++ } ++ if let Some(guard) = ctx_lock.try_read(token.token()) { ++ if guard.status.is_runnable() ++ && !guard.running ++ && guard.sched_affinity.contains(cpu_id) ++ && guard.sched_policy == SchedPolicy::Other ++ { ++ let mut vruntime = guard.vruntime; ++ if guard.last_cpu == Some(cpu_id) { ++ vruntime = vruntime.saturating_sub(vruntime / 8); ++ } ++ drop(guard); ++ if vruntime < min_vruntime { ++ min_vruntime = vruntime; ++ best = Some((prio, ctx_ref.clone())); ++ } ++ } ++ } ++ } ++ } ++ } ++ if let Some((best_prio, ctx_ref)) = best { ++ contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref)); ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ let guard = unsafe { ctx_lock.write_arc() }; ++ if guard.status.is_runnable() ++ && !guard.running ++ && guard.sched_affinity.contains(cpu_id) ++ && guard.sched_policy == SchedPolicy::Other ++ { ++ return Some(guard); ++ } ++ ++ drop(guard); ++ contexts_list[best_prio].push_back(ctx_ref); ++ } ++ } ++ } ++ + 'priority: loop { +- i = (i + 1) % 40; ++ *i = (*i + 1) % RUN_QUEUE_COUNT; + total_iters += 1; + +- // The least prioritised queue takes <5000 iters to build up +- // balance = sched_prio_to_weight[20], if we have already spent +- // that many iters and not found any context, it is better to just +- // skip for now + if total_iters >= 5000 { + break 'priority; + } +@@ -394,24 +713,21 @@ fn select_next_context( + } + + let contexts = contexts_list +- .get_mut(i) ++ .get_mut(*i) + .expect("i should be between [0, 39]!"); + + if contexts.is_empty() { + empty_queues += 1; +- if empty_queues >= 40 { +- // If all queues are empty, just break out ++ if empty_queues >= RUN_QUEUE_COUNT { + break 'priority; + } + continue; +- } else { +- empty_queues = 0; + } + +- if balance[i] < SCHED_PRIO_TO_WEIGHT[20] { +- // This queue does not have enough balance to run, +- // increment the balance! 
+- balance[i] += SCHED_PRIO_TO_WEIGHT[i]; ++ empty_queues = 0; ++ ++ if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] { ++ balance[*i] += SCHED_PRIO_TO_WEIGHT[*i]; + continue; + } + +@@ -422,67 +738,331 @@ fn select_next_context( + Some(new_lock) => (lock, new_lock), + None => { + skipped_contexts += 1; +- continue; // Ghost Process, just continue ++ continue; + } + }, +- None => break, // Empty Queue ++ None => break, + }; + +- if Arc::ptr_eq(&next_context_lock, &prev_context_lock) { ++ if Arc::ptr_eq(&next_context_lock, prev_context_lock) ++ || Arc::ptr_eq(&next_context_lock, idle_context) ++ { + contexts.push_back(next_context_ref); + continue; + } +- if Arc::ptr_eq(&next_context_lock, &idle_context) { ++ let mut next_context_guard = unsafe { next_context_lock.write_arc() }; ++ ++ let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) }; ++ if let UpdateResult::CanSwitch = sw { ++ balance[*i] -= SCHED_PRIO_TO_WEIGHT[20]; ++ return Some(next_context_guard); ++ } ++ ++ if matches!(sw, UpdateResult::Blocked) { ++ idle_contexts(token.downgrade()).push_back(next_context_ref); ++ } else { ++ contexts.push_back(next_context_ref); ++ } ++ skipped_contexts += 1; ++ ++ if skipped_contexts >= total_contexts { ++ break 'priority; ++ } ++ } ++ } ++ ++ None ++} ++ ++fn pick_next_from_global_queues( ++ token: &mut LockToken, ++ contexts_list: &mut [alloc::collections::VecDeque; RUN_QUEUE_COUNT], ++ cpu_id: LogicalCpuId, ++ switch_time: u128, ++ prev_context_lock: &Arc, ++ idle_context: &Arc, ++ balance: &mut [usize; RUN_QUEUE_COUNT], ++ i: &mut usize, ++) -> Option { ++ let mut empty_queues = 0; ++ let mut total_iters = 0; ++ let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); ++ let mut skipped_contexts = 0; ++ ++ for prio in 0..RUN_QUEUE_COUNT { ++ let rt_contexts = contexts_list ++ .get_mut(prio) ++ .expect("prio should be between [0, 39]"); ++ let len = rt_contexts.len(); ++ for _ in 0..len { ++ let (rt_ref, rt_lock) = match rt_contexts.pop_front() { ++ Some(lock) => match lock.upgrade() { ++ Some(l) => (lock, l), ++ None => { ++ skipped_contexts += 1; ++ continue; ++ } ++ }, ++ None => break, ++ }; ++ if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ let rt_guard = unsafe { rt_lock.write_arc() }; ++ if !rt_guard.status.is_runnable() ++ || rt_guard.running ++ || !rt_guard.sched_affinity.contains(cpu_id) ++ { ++ rt_contexts.push_back(rt_ref); ++ continue; ++ } ++ if rt_guard.sched_policy == SchedPolicy::Fifo ++ || rt_guard.sched_policy == SchedPolicy::RoundRobin ++ { ++ return Some(rt_guard); ++ } ++ rt_contexts.push_back(rt_ref); ++ } ++ } ++ ++ { ++ let mut min_vruntime = u128::MAX; ++ let mut best: Option<(usize, WeakContextRef)> = None; ++ for (prio, queue) in contexts_list.iter().enumerate() { ++ for ctx_ref in queue.iter() { ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ if Arc::ptr_eq(&ctx_lock, prev_context_lock) ++ || Arc::ptr_eq(&ctx_lock, idle_context) ++ { ++ continue; ++ } ++ if let Some(guard) = ctx_lock.try_read(token.token()) { ++ if guard.status.is_runnable() ++ && !guard.running ++ && guard.sched_affinity.contains(cpu_id) ++ && guard.sched_policy == SchedPolicy::Other ++ { ++ let mut vruntime = guard.vruntime; ++ if guard.last_cpu == Some(cpu_id) { ++ vruntime = vruntime.saturating_sub(vruntime / 8); ++ } ++ drop(guard); ++ if vruntime < min_vruntime { ++ min_vruntime = vruntime; ++ best = Some((prio, ctx_ref.clone())); ++ } ++ } ++ } ++ } ++ } ++ } 
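++        // Re-check the candidate under a write lock: it may have started
++        // running or changed affinity since the read-only vruntime scan.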
++ if let Some((best_prio, ctx_ref)) = best { ++ contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref)); ++ if let Some(ctx_lock) = ctx_ref.upgrade() { ++ let guard = unsafe { ctx_lock.write_arc() }; ++ if guard.status.is_runnable() ++ && !guard.running ++ && guard.sched_affinity.contains(cpu_id) ++ && guard.sched_policy == SchedPolicy::Other ++ { ++ return Some(guard); ++ } ++ ++ drop(guard); ++ contexts_list[best_prio].push_back(ctx_ref); ++ } ++ } ++ } ++ ++ 'priority: loop { ++ *i = (*i + 1) % RUN_QUEUE_COUNT; ++ total_iters += 1; ++ ++ if total_iters >= 5000 { ++ break 'priority; ++ } ++ ++ if skipped_contexts > total_contexts && total_contexts > 0 { ++ break 'priority; ++ } ++ ++ let contexts = contexts_list ++ .get_mut(*i) ++ .expect("i should be between [0, 39]!"); ++ ++ if contexts.is_empty() { ++ empty_queues += 1; ++ if empty_queues >= RUN_QUEUE_COUNT { ++ break 'priority; ++ } ++ continue; ++ } ++ ++ empty_queues = 0; ++ ++ if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] { ++ balance[*i] += SCHED_PRIO_TO_WEIGHT[*i]; ++ continue; ++ } ++ ++ let len = contexts.len(); ++ for _ in 0..len { ++ let (next_context_ref, next_context_lock) = match contexts.pop_front() { ++ Some(lock) => match lock.upgrade() { ++ Some(new_lock) => (lock, new_lock), ++ None => { ++ skipped_contexts += 1; ++ continue; ++ } ++ }, ++ None => break, ++ }; ++ ++ if Arc::ptr_eq(&next_context_lock, prev_context_lock) ++ || Arc::ptr_eq(&next_context_lock, idle_context) ++ { + contexts.push_back(next_context_ref); + continue; + } + let mut next_context_guard = unsafe { next_context_lock.write_arc() }; + +- // Is this context runnable on this CPU? + let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) }; + if let UpdateResult::CanSwitch = sw { +- next_context_guard_opt = Some(next_context_guard); +- balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; +- break 'priority; ++ balance[*i] -= SCHED_PRIO_TO_WEIGHT[20]; ++ return Some(next_context_guard); ++ } ++ ++ if matches!(sw, UpdateResult::Blocked) { ++ idle_contexts(token.token()).push_back(next_context_ref); + } else { +- if matches!(sw, UpdateResult::Blocked) { +- idle_contexts(token.token()).push_back(next_context_ref); +- } else { +- contexts.push_back(next_context_ref); +- }; +- skipped_contexts += 1; ++ contexts.push_back(next_context_ref); ++ } ++ skipped_contexts += 1; + +- if skipped_contexts >= total_contexts { +- break 'priority; +- } ++ if skipped_contexts >= total_contexts { ++ break 'priority; + } + } + } +- percpu.balance.set(balance); +- percpu.last_queue.set(i); +- +- if !Arc::ptr_eq(&prev_context_lock, &idle_context) { +- // Send the old process to the back of the line (if it is still runnable) +- let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); +- if prev_context_guard.status.is_runnable() { +- let prio = prev_context_guard.prio; +- contexts_list[prio].push_back(prev_ctx); +- } else { +- idle_contexts(token.token()).push_back(prev_ctx); +- } ++ ++ None ++} ++ ++unsafe fn update_stealable(context: &mut Context, switch_time: u128) -> UpdateResult { ++ if context.running { ++ return UpdateResult::Skip; + } ++ if context.status.is_soft_blocked() ++ && let Some(wake) = context.wake ++ && switch_time >= wake ++ { ++ context.wake = None; ++ context.unblock_no_ipi(); ++ } ++ if context.status.is_runnable() { ++ UpdateResult::CanSwitch ++ } else { ++ UpdateResult::Blocked ++ } ++} + +- if let Some(next_context_guard) = next_context_guard_opt { +- // We found a new process! 
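++// Selection order: this CPU's local queues first, then work stealing from
++// other CPUs, and finally the legacy global run queues as a fallback.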
++/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler ++fn select_next_context( ++ token: &mut CleanLockToken, ++ percpu: &PercpuBlock, ++ cpu_id: LogicalCpuId, ++ switch_time: u128, ++ was_idle: bool, ++ prev_context_guard: &mut ArcRwLockWriteGuard, ++) -> Result, SwitchResult> { ++ let idle_context = percpu.switch_internals.idle_context(); ++ let prev_context_lock = crate::context::current(); ++ ++ let local_next = { ++ let mut sched_lock = SchedQueuesLock::new(&percpu.sched); ++ let mut balance = percpu.sched.balance.get(); ++ let mut last_queue = percpu.sched.last_queue.get() % RUN_QUEUE_COUNT; ++ let next = pick_next_from_queues( ++ token, ++ unsafe { sched_lock.queues_mut() }, ++ cpu_id, ++ switch_time, ++ &prev_context_lock, ++ &idle_context, ++ &mut balance, ++ &mut last_queue, ++ ); ++ percpu.sched.balance.set(balance); ++ percpu.sched.last_queue.set(last_queue); ++ next ++ }; ++ ++ if let Some(next_context_guard) = local_next { ++ queue_previous_context( ++ token, ++ percpu, ++ &prev_context_lock, ++ prev_context_guard, ++ &idle_context, ++ ); ++ return Ok(Some(next_context_guard)); ++ } ++ ++ if let Some(next_context_guard) = steal_work(token, cpu_id, switch_time) { ++ queue_previous_context( ++ token, ++ percpu, ++ &prev_context_lock, ++ prev_context_guard, ++ &idle_context, ++ ); ++ return Ok(Some(next_context_guard)); ++ } ++ ++ let global_next = { ++ let contexts_data = run_contexts(token.token()); ++ let (mut contexts_data, mut contexts_token) = contexts_data.into_split(); ++ let mut balance = percpu.sched.balance.get(); ++ let mut last_queue = percpu.sched.last_queue.get() % RUN_QUEUE_COUNT; ++ let next = pick_next_from_global_queues( ++ &mut contexts_token, ++ &mut contexts_data.set, ++ cpu_id, ++ switch_time, ++ &prev_context_lock, ++ &idle_context, ++ &mut balance, ++ &mut last_queue, ++ ); ++ percpu.sched.balance.set(balance); ++ percpu.sched.last_queue.set(last_queue); ++ next ++ }; ++ ++ if let Some(next_context_guard) = global_next { ++ queue_previous_context( ++ token, ++ percpu, ++ &prev_context_lock, ++ prev_context_guard, ++ &idle_context, ++ ); + return Ok(Some(next_context_guard)); ++ } ++ ++ queue_previous_context( ++ token, ++ percpu, ++ &prev_context_lock, ++ prev_context_guard, ++ &idle_context, ++ ); ++ ++ if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) { ++ Ok(Some(unsafe { idle_context.write_arc() })) + } else { +- if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) { +- // We switch into the idle context +- Ok(Some(unsafe { idle_context.write_arc() })) +- } else { +- // We found no other process to run. 
+- Ok(None) +- } ++ Ok(None) + } + } + diff --git a/local/patches/kernel/P8-work-stealing.patch b/local/patches/kernel/P8-work-stealing.patch new file mode 100644 index 00000000..e4c9a25d --- /dev/null +++ b/local/patches/kernel/P8-work-stealing.patch @@ -0,0 +1,190 @@ +diff --git a/src/percpu.rs b/src/percpu.rs +--- a/src/percpu.rs ++++ b/src/percpu.rs +@@ -100,6 +100,14 @@ static ALL_PERCPU_BLOCKS: [AtomicPtr; MAX_CPU_COUNT as usize] = + pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) { + ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release) + } ++ ++pub fn get_percpu_block(id: LogicalCpuId) -> Option<&'static PercpuBlock> { ++ unsafe { ++ ALL_PERCPU_BLOCKS[id.get() as usize] ++ .load(Ordering::Acquire) ++ .as_ref() ++ } ++} + + pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> { +diff --git a/src/context/switch.rs b/src/context/switch.rs +--- a/src/context/switch.rs ++++ b/src/context/switch.rs +@@ -7,15 +7,15 @@ use crate::{ + self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard, + Context, ContextLock, SchedPolicy, WeakContextRef, RUN_QUEUE_COUNT, + }, +- cpu_set::LogicalCpuId, ++ cpu_set::{LogicalCpuId, LogicalCpuSet}, + cpu_stats::{self, CpuState}, +- percpu::{PerCpuSched, PercpuBlock}, ++ percpu::{get_percpu_block, PerCpuSched, PercpuBlock}, + sync::{ArcRwLockWriteGuard, CleanLockToken, LockToken, L1, L4}, + }; + use alloc::{sync::Arc, vec::Vec}; + use core::{ + cell::{Cell, RefCell}, + hint, mem, +- sync::atomic::Ordering, ++ sync::atomic::{AtomicUsize, Ordering}, + }; + use syscall::PtraceFlags; +@@ ++static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0); ++ ++fn assign_context_to_cpu(context: &mut Context, cpu_id: LogicalCpuId) { ++ context.sched_affinity = LogicalCpuSet::empty(); ++ context.sched_affinity.atomic_set(cpu_id); ++} +@@ ++fn pop_movable_context( ++ token: &mut CleanLockToken, ++ queues: &mut [alloc::collections::VecDeque; RUN_QUEUE_COUNT], ++ target_cpu: LogicalCpuId, ++ switch_time: u128, ++ idle_context: &Arc, ++) -> Option<(usize, WeakContextRef)> { ++ for prio in 0..RUN_QUEUE_COUNT { ++ let len = queues[prio].len(); ++ for _ in 0..len { ++ let Some(context_ref) = queues[prio].pop_front() else { ++ break; ++ }; ++ let Some(context_lock) = context_ref.upgrade() else { ++ continue; ++ }; ++ if Arc::ptr_eq(&context_lock, idle_context) { ++ queues[prio].push_back(context_ref); ++ continue; ++ } ++ ++ let mut context_guard = unsafe { context_lock.write_arc() }; ++ let sw = unsafe { update_stealable(&mut context_guard, switch_time) }; ++ if let UpdateResult::CanSwitch = sw { ++ assign_context_to_cpu(&mut context_guard, target_cpu); ++ let moved_ref = WeakContextRef(Arc::downgrade(ArcContextLockWriteGuard::rwlock( ++ &context_guard, ++ ))); ++ drop(context_guard); ++ return Some((prio, moved_ref)); ++ } ++ ++ if matches!(sw, UpdateResult::Blocked) { ++ idle_contexts(token.downgrade()).push_back(context_ref); ++ } else { ++ queues[prio].push_back(context_ref); ++ } ++ } ++ } ++ ++ None ++} ++ ++fn steal_work( ++ token: &mut CleanLockToken, ++ cpu_id: LogicalCpuId, ++ switch_time: u128, ++) -> Option { ++ let cpu_count = crate::cpu_count(); ++ if cpu_count <= 1 { ++ return None; ++ } ++ ++ for offset in 1..cpu_count { ++ let victim_id = LogicalCpuId::new((cpu_id.get() + offset) % cpu_count); ++ let Some(victim) = get_percpu_block(victim_id) else { ++ continue; ++ }; ++ ++ let victim_idle = victim.switch_internals.idle_context(); ++ let mut victim_lock = 
SchedQueuesLock::new(&victim.sched); ++ let victim_queues = unsafe { victim_lock.queues_mut() }; ++ ++ for prio in 0..RUN_QUEUE_COUNT { ++ let len = victim_queues[prio].len(); ++ for _ in 0..len { ++ let Some(context_ref) = victim_queues[prio].pop_front() else { ++ break; ++ }; ++ let Some(context_lock) = context_ref.upgrade() else { ++ continue; ++ }; ++ if Arc::ptr_eq(&context_lock, &victim_idle) { ++ victim_queues[prio].push_back(context_ref); ++ continue; ++ } ++ ++ let mut context_guard = unsafe { context_lock.write_arc() }; ++ let sw = unsafe { update_stealable(&mut context_guard, switch_time) }; ++ if let UpdateResult::CanSwitch = sw { ++ assign_context_to_cpu(&mut context_guard, cpu_id); ++ SCHED_STEAL_COUNT.fetch_add(1, Ordering::Relaxed); ++ return Some(context_guard); ++ } ++ ++ if matches!(sw, UpdateResult::Blocked) { ++ idle_contexts(token.downgrade()).push_back(context_ref); ++ } else { ++ victim_queues[prio].push_back(context_ref); ++ } ++ } ++ } ++ } ++ ++ None ++} ++ ++unsafe fn update_stealable(context: &mut Context, switch_time: u128) -> UpdateResult { ++ if context.running { ++ return UpdateResult::Skip; ++ } ++ if context.status.is_soft_blocked() ++ && let Some(wake) = context.wake ++ && switch_time >= wake ++ { ++ context.wake = None; ++ context.unblock_no_ipi(); ++ } ++ if context.status.is_runnable() { ++ UpdateResult::CanSwitch ++ } else { ++ UpdateResult::Blocked ++ } ++} +@@ -360,6 +469,10 @@ fn wakeup_contexts(token: &mut CleanLockToken, percpu: &PercpuBlock, switch_time + let mut sched_lock = SchedQueuesLock::new(&percpu.sched); + let run_queues = unsafe { sched_lock.queues_mut() }; + for (prio, context_ref) in wakeups { ++ if let Some(context_lock) = context_ref.upgrade() { ++ let mut context_guard = unsafe { context_lock.write_arc() }; ++ assign_context_to_cpu(&mut context_guard, percpu.cpu_id); ++ } + run_queues[prio].push_back(context_ref); + } + } +@@ -559,6 +672,16 @@ fn select_next_context( + ); + return Ok(Some(next_context_guard)); + } ++ ++ if let Some(next_context_guard) = steal_work(token, cpu_id, switch_time) { ++ queue_previous_context( ++ token, ++ percpu, ++ &prev_context_lock, ++ prev_context_guard, ++ &idle_context, ++ ); ++ return Ok(Some(next_context_guard)); ++ } + + let global_next = { + let contexts_data = run_contexts(token.token()); diff --git a/local/patches/kernel/P9-futex-pi-cas-fix.patch b/local/patches/kernel/P9-futex-pi-cas-fix.patch new file mode 100644 index 00000000..74e45918 --- /dev/null +++ b/local/patches/kernel/P9-futex-pi-cas-fix.patch @@ -0,0 +1,21 @@ +diff --git a/src/syscall/futex.rs b/src/syscall/futex.rs +--- a/src/syscall/futex.rs ++++ b/src/syscall/futex.rs +@@ +- let futex_atomic = futex_atomic_u32(locked_physaddr); +- let mut current = futex_atomic.load(Ordering::SeqCst); ++ let futex_atomic = futex_atomic_u32(locked_physaddr); ++ let mut current = futex_atomic.load(Ordering::SeqCst); ++ let queue = futexes ++ .entry(locked_physaddr) ++ .or_insert_with(FutexQueue::default); + + loop { + let owner_tid = current & FUTEX_TID_MASK; +- let queue = futexes +- .entry(locked_physaddr) +- .or_insert_with(FutexQueue::default); + let desired_waiters = if queue.waiters.is_empty() { + 0 + } else { + FUTEX_WAITERS diff --git a/local/patches/kernel/P9-numa-topology.patch b/local/patches/kernel/P9-numa-topology.patch new file mode 100644 index 00000000..4ab16e51 --- /dev/null +++ b/local/patches/kernel/P9-numa-topology.patch @@ -0,0 +1,68 @@ +diff --git a/src/numa.rs b/src/numa.rs +new file mode 100644 +index 0000000..40c5a06 
+--- /dev/null
++++ b/src/numa.rs
+@@ -0,0 +1,62 @@
++/// NUMA topology hints for the kernel scheduler.
++/// NUMA discovery (SRAT/SLIT parsing) is performed by a userspace daemon
++/// (numad) via /scheme/acpi/, then pushed to the kernel via scheme:numa.
++/// The kernel stores a lightweight copy for O(1) scheduling lookups.
++use crate::cpu_set::{LogicalCpuId, LogicalCpuSet};
++use core::sync::atomic::{AtomicBool, Ordering};
++
++const MAX_NUMA_NODES: usize = 8;
++
++#[derive(Clone, Debug)]
++pub struct NumaHint {
++    pub node_id: u8,
++    pub cpus: LogicalCpuSet,
++}
++
++pub struct NumaTopology {
++    pub nodes: [Option<NumaHint>; MAX_NUMA_NODES],
++    pub initialized: AtomicBool,
++}
++
++impl NumaTopology {
++    pub const fn new() -> Self {
++        const NONE: Option<NumaHint> = None;
++        Self {
++            nodes: [NONE; MAX_NUMA_NODES],
++            initialized: AtomicBool::new(false),
++        }
++    }
++
++    pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> {
++        for node in self.nodes.iter().flatten() {
++            if node.cpus.contains(cpu) {
++                return Some(node.node_id);
++            }
++        }
++        None
++    }
++
++    pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool {
++        self.node_for_cpu(cpu1) == self.node_for_cpu(cpu2)
++    }
++}
++
++static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new();
++
++pub fn topology() -> &'static NumaTopology {
++    unsafe { &NUMA_TOPOLOGY }
++}
++
++pub fn init_default() {
++    let topo = topology();
++    if topo.initialized.swap(true, Ordering::AcqRel) {
++        return;
++    }
++    unsafe {
++        let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
++        topo_mut.nodes[0] = Some(NumaHint {
++            node_id: 0,
++            cpus: LogicalCpuSet::all(),
++        });
++    }
++}
diff --git a/local/patches/kernel/P9-proc-lock-ordering.patch b/local/patches/kernel/P9-proc-lock-ordering.patch
new file mode 100644
index 00000000..d6b30978
--- /dev/null
+++ b/local/patches/kernel/P9-proc-lock-ordering.patch
@@ -0,0 +1,41 @@
+diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs
+--- a/src/scheme/proc.rs
++++ b/src/scheme/proc.rs
+@@ -450,6 +450,7 @@ impl KernelScheme for ProcScheme {
+     }
+
+     fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
++        let mut inner_token = unsafe { CleanLockToken::new() };
+         let handle = HANDLES
+             .write(token.token())
+             .remove(&id)
+@@ -478,9 +479,7 @@
+         ))]
+         regs.set_arg1(arg1);
+
+-        // TODO: Lock ordering violation
+-        let mut token = unsafe { CleanLockToken::new() };
+-        Ok(context.set_addr_space(Some(new), token.downgrade()))
++        Ok(context.set_addr_space(Some(new), inner_token.downgrade()))
+     })?;
+     if let Some(old_ctx) = old_ctx
+         && let Some(addrspace) = Arc::into_inner(old_ctx)
+@@ -518,6 +517,7 @@
+         consume: bool,
+         token: &mut CleanLockToken,
+     ) -> Result {
++        let mut inner_token = unsafe { CleanLockToken::new() };
+         let handle = HANDLES
+             .read(token.token())
+             .get(&id)
+@@ -609,9 +609,7 @@
+         };
+         // TODO: Allocated or AllocatedShared?
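Both hunks above apply the same discipline: the `CleanLockToken` needed by a nested call is constructed once at function entry (`inner_token`), before the `HANDLES` lock is taken, rather than conjured inside the already-locked region as the removed `// TODO: Lock ordering violation` code did. A toy sketch of the rule with stand-in types (illustrative only, not kernel code):

```rust
use std::sync::Mutex;

struct Token;   // stand-in for CleanLockToken
struct Handles; // stand-in for the HANDLES table

fn close(handles: &Mutex<Handles>) {
    // Construct the token for nested calls FIRST, while no locks are held,
    // so the acquisition order is always token-then-handles, never reversed.
    let inner_token = Token;
    let _guard = handles.lock().unwrap();
    nested_call(&inner_token); // uses the pre-made token under the lock
}

fn nested_call(_token: &Token) {}
```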
+ let addrsp = AddrSpace::current()?; +- // TODO: Lock ordering violation +- let mut token = unsafe { CleanLockToken::new() }; +- let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere( ++ let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere( + &addrsp, + NonZeroUsize::new(1).unwrap(), + MapFlags::PROT_READ | MapFlags::PROT_WRITE, diff --git a/local/patches/local/P3-firmware-fallback-cache.patch b/local/patches/local/P3-firmware-fallback-cache.patch new file mode 100644 index 00000000..bbe683ab --- /dev/null +++ b/local/patches/local/P3-firmware-fallback-cache.patch @@ -0,0 +1,1414 @@ +diff --git a/local/recipes/system/firmware-loader/source/Cargo.toml b/local/recipes/system/firmware-loader/source/Cargo.toml +index 0e273efcd..6f91edb3a 100644 +--- a/local/recipes/system/firmware-loader/source/Cargo.toml ++++ b/local/recipes/system/firmware-loader/source/Cargo.toml +@@ -10,3 +10,4 @@ redox_scheme = { package = "redox-scheme", version = "0.11" } + libredox = "0.1" + log = { version = "0.4", features = ["std"] } + thiserror = "2" ++toml = "0.8" +diff --git a/local/recipes/system/firmware-loader/source/src/blob.rs b/local/recipes/system/firmware-loader/source/src/blob.rs +index 911b6e5b0..349d29657 100644 +--- a/local/recipes/system/firmware-loader/source/src/blob.rs ++++ b/local/recipes/system/firmware-loader/source/src/blob.rs +@@ -1,11 +1,18 @@ +-use std::collections::HashMap; ++use std::collections::{HashMap, HashSet}; ++use std::ffi::OsStr; + use std::fs; +-use std::path::{Path, PathBuf}; ++use std::io::ErrorKind; ++use std::path::{Component, Path, PathBuf}; ++use std::sync::mpsc::{self, RecvTimeoutError}; + use std::sync::{Arc, Mutex}; ++use std::time::{Duration, Instant, UNIX_EPOCH}; + + use log::{info, warn}; + use thiserror::Error; + ++const DEFAULT_FALLBACKS_DIR: &str = "/etc/firmware-fallbacks.d"; ++const DEFAULT_CACHE_DIR: &str = "/var/lib/firmware/cache"; ++ + #[allow(dead_code)] + #[derive(Error, Debug)] + pub enum BlobError { +@@ -21,6 +28,8 @@ pub enum BlobError { + #[source] + source: std::io::Error, + }, ++ #[error("firmware load timed out for {key} after {timeout:?}")] ++ LoadTimeout { key: String, timeout: Duration }, + } + + #[allow(dead_code)] +@@ -30,20 +39,365 @@ pub struct FirmwareBlob { + pub path: PathBuf, + } + ++#[derive(Clone, Debug, Eq, PartialEq)] ++struct CacheMetadata { ++ requested_key: String, ++ source_key: String, ++ source_mtime_ns: u128, ++ source_len: u64, ++} ++ ++impl CacheMetadata { ++ #[allow(dead_code)] ++ fn placeholder(key: &str, len: u64) -> Self { ++ Self { ++ requested_key: key.to_string(), ++ source_key: key.to_string(), ++ source_mtime_ns: 0, ++ source_len: len, ++ } ++ } ++ ++ fn from_source(requested_key: &str, source_key: &str, signature: &SourceSignature) -> Self { ++ Self { ++ requested_key: requested_key.to_string(), ++ source_key: source_key.to_string(), ++ source_mtime_ns: signature.modified_ns, ++ source_len: signature.len, ++ } ++ } ++ ++ fn matches(&self, requested_key: &str, source_key: &str, signature: &SourceSignature) -> bool { ++ self.requested_key == requested_key ++ && self.source_key == source_key ++ && self.source_mtime_ns == signature.modified_ns ++ && self.source_len == signature.len ++ } ++} ++ ++#[derive(Clone)] ++struct CachedBlob { ++ data: Arc>, ++ metadata: CacheMetadata, ++} ++ ++#[derive(Clone, Debug, Eq, PartialEq)] ++struct SourceSignature { ++ modified_ns: u128, ++ len: u64, ++} ++ ++pub struct FirmwareFallback { ++ fallbacks: HashMap>, ++} ++ ++impl FirmwareFallback { ++ pub fn 
load_defaults() -> Self { ++ Self::load_from_dir(Path::new(DEFAULT_FALLBACKS_DIR)) ++ } ++ ++ fn load_from_dir(dir: &Path) -> Self { ++ let mut fallbacks = Self::builtins(); ++ ++ let entries = match fs::read_dir(dir) { ++ Ok(entries) => entries, ++ Err(err) if err.kind() == ErrorKind::NotFound => return fallbacks, ++ Err(err) => { ++ warn!( ++ "firmware-loader: failed to read fallback directory {}: {}", ++ dir.display(), ++ err ++ ); ++ return fallbacks; ++ } ++ }; ++ ++ let mut paths = Vec::new(); ++ for entry in entries { ++ match entry { ++ Ok(entry) => { ++ let path = entry.path(); ++ if path.extension() == Some(OsStr::new("toml")) { ++ paths.push(path); ++ } ++ } ++ Err(err) => warn!( ++ "firmware-loader: skipping unreadable fallback entry in {}: {}", ++ dir.display(), ++ err ++ ), ++ } ++ } ++ paths.sort(); ++ ++ for path in paths { ++ let contents = match fs::read_to_string(&path) { ++ Ok(contents) => contents, ++ Err(err) => { ++ warn!( ++ "firmware-loader: failed to read fallback file {}: {}", ++ path.display(), ++ err ++ ); ++ continue; ++ } ++ }; ++ ++ match parse_fallback_file(&contents) { ++ Ok(loaded) => { ++ for (pattern, variants) in loaded { ++ if variants.is_empty() { ++ continue; ++ } ++ fallbacks ++ .fallbacks ++ .entry(pattern) ++ .or_default() ++ .extend(variants); ++ } ++ } ++ Err(err) => warn!( ++ "firmware-loader: failed to parse fallback file {}: {}", ++ path.display(), ++ err ++ ), ++ } ++ } ++ ++ fallbacks ++ } ++ ++ pub fn get_fallback_chain(&self, key: &str) -> Vec { ++ let mut chain = Vec::new(); ++ let mut seen = HashSet::new(); ++ ++ if let Some(exact) = self.fallbacks.get(key) { ++ append_variants(key, "", exact, &mut seen, &mut chain); ++ } ++ ++ let mut patterns: Vec<&str> = self.fallbacks.keys().map(String::as_str).collect(); ++ patterns.sort_unstable(); ++ ++ for pattern in patterns { ++ if pattern == key { ++ continue; ++ } ++ ++ if let Some(capture) = pattern_capture(pattern, key) { ++ if let Some(variants) = self.fallbacks.get(pattern) { ++ append_variants(key, capture, variants, &mut seen, &mut chain); ++ } ++ } ++ } ++ ++ chain ++ } ++ ++ fn builtins() -> Self { ++ let mut fallbacks = HashMap::new(); ++ fallbacks.insert( ++ "amdgpu/dmcub_dcn31.bin".to_string(), ++ vec![ ++ "amdgpu/dmcub_dcn30.bin".to_string(), ++ "amdgpu/dmcub_dcn20.bin".to_string(), ++ ], ++ ); ++ fallbacks.insert( ++ "amdgpu/dmcub_dcn30.bin".to_string(), ++ vec!["amdgpu/dmcub_dcn20.bin".to_string()], ++ ); ++ fallbacks.insert( ++ "iwlwifi-*-92.ucode".to_string(), ++ vec![ ++ "iwlwifi-*-83.ucode".to_string(), ++ "iwlwifi-*-77.ucode".to_string(), ++ ], ++ ); ++ fallbacks.insert( ++ "iwlwifi-*-83.ucode".to_string(), ++ vec!["iwlwifi-*-77.ucode".to_string()], ++ ); ++ ++ Self { fallbacks } ++ } ++} ++ ++pub struct FirmwareCache { ++ cache_dir: PathBuf, ++} ++ ++impl FirmwareCache { ++ pub fn new(cache_dir: &Path) -> Self { ++ Self { ++ cache_dir: cache_dir.to_path_buf(), ++ } ++ } ++ ++ #[allow(dead_code)] ++ pub fn get(&self, key: &str) -> Option> { ++ self.load_entry(key, None, None) ++ .ok() ++ .flatten() ++ .map(|entry| entry.data.as_ref().clone()) ++ } ++ ++ #[allow(dead_code)] ++ pub fn store(&self, key: &str, data: &[u8]) -> Result<(), std::io::Error> { ++ self.store_entry( ++ key, ++ data, ++ &CacheMetadata::placeholder(key, data.len() as u64), ++ ) ++ } ++ ++ pub fn invalidate(&self, key: &str) { ++ let Some(path) = self.cache_path(key) else { ++ return; ++ }; ++ ++ for cache_file in [path.clone(), metadata_path_for(&path)] { ++ match fs::remove_file(&cache_file) { ++ 
Ok(()) => {} ++ Err(err) if err.kind() == ErrorKind::NotFound => {} ++ Err(err) => warn!( ++ "firmware-loader: failed to invalidate persistent cache {}: {}", ++ cache_file.display(), ++ err ++ ), ++ } ++ } ++ } ++ ++ fn contains(&self, key: &str) -> bool { ++ self.cache_path(key).is_some_and(|path| path.exists()) ++ } ++ ++ fn load_entry( ++ &self, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++ ) -> Result, BlobError> { ++ let Some(path) = self.cache_path(key) else { ++ warn!( ++ "firmware-loader: refusing to read invalid persistent cache key {}", ++ key ++ ); ++ return Ok(None); ++ }; ++ ++ let metadata_path = metadata_path_for(&path); ++ if !path.exists() { ++ return Ok(None); ++ } ++ ++ let metadata = match load_cache_metadata(&metadata_path, key, started_at, timeout) { ++ Ok(metadata) => { ++ let Some(metadata) = metadata else { ++ self.invalidate(key); ++ return Ok(None); ++ }; ++ metadata ++ } ++ Err(BlobError::LoadTimeout { .. }) => { ++ return Err(BlobError::LoadTimeout { ++ key: key.to_string(), ++ timeout: timeout.unwrap_or_default(), ++ }) ++ } ++ Err(err) => { ++ warn!( ++ "firmware-loader: failed to load metadata for persistent cache {}: {}", ++ metadata_path.display(), ++ err ++ ); ++ self.invalidate(key); ++ return Ok(None); ++ } ++ }; ++ ++ match read_path_bytes(&path, key, started_at, timeout) { ++ Ok(data) => Ok(Some(CachedBlob { ++ data: Arc::new(data), ++ metadata, ++ })), ++ Err(BlobError::ReadError { .. }) => { ++ warn!( ++ "firmware-loader: failed to read persistent cache {}, invalidating entry", ++ path.display() ++ ); ++ self.invalidate(key); ++ Ok(None) ++ } ++ Err(err) => Err(err), ++ } ++ } ++ ++ fn store_entry( ++ &self, ++ key: &str, ++ data: &[u8], ++ metadata: &CacheMetadata, ++ ) -> Result<(), std::io::Error> { ++ let path = self.cache_path(key).ok_or_else(|| { ++ std::io::Error::new( ++ ErrorKind::InvalidInput, ++ format!("invalid cache key for persistent firmware cache: {key}"), ++ ) ++ })?; ++ let metadata_path = metadata_path_for(&path); ++ ++ if let Some(parent) = path.parent() { ++ fs::create_dir_all(parent)?; ++ } ++ ++ fs::write(&path, data)?; ++ write_cache_metadata(&metadata_path, metadata) ++ } ++ ++ fn cache_path(&self, key: &str) -> Option { ++ if !is_safe_key(key) { ++ return None; ++ } ++ ++ let relative = Path::new(key); ++ if relative.is_absolute() { ++ return None; ++ } ++ ++ if relative.components().any(|component| { ++ matches!( ++ component, ++ Component::ParentDir ++ | Component::CurDir ++ | Component::Prefix(_) ++ | Component::RootDir ++ ) ++ }) { ++ return None; ++ } ++ ++ Some(self.cache_dir.join(relative)) ++ } ++} ++ + #[allow(dead_code)] + pub struct FirmwareRegistry { + base_dir: PathBuf, + blobs: HashMap, +- cache: Arc>>>>, ++ cache: Arc>>, ++ persistent_cache: FirmwareCache, ++ fallbacks: FirmwareFallback, + } + + impl FirmwareRegistry { + pub fn empty(base_dir: &Path) -> Self { +- FirmwareRegistry { +- base_dir: base_dir.to_path_buf(), +- blobs: HashMap::new(), +- cache: Arc::new(Mutex::new(HashMap::new())), +- } ++ Self::with_components( ++ base_dir, ++ HashMap::new(), ++ FirmwareCache::new(Path::new(DEFAULT_CACHE_DIR)), ++ FirmwareFallback::load_defaults(), ++ ) + } + + pub fn new(base_dir: &Path) -> Result { +@@ -58,11 +412,12 @@ impl FirmwareRegistry { + base_dir.display() + ); + +- Ok(FirmwareRegistry { +- base_dir: base_dir.to_path_buf(), ++ Ok(Self::with_components( ++ base_dir, + blobs, +- cache: Arc::new(Mutex::new(HashMap::new())), +- }) ++ FirmwareCache::new(Path::new(DEFAULT_CACHE_DIR)), ++ 
FirmwareFallback::load_defaults(), ++ )) + } + + #[allow(dead_code)] +@@ -73,56 +428,236 @@ impl FirmwareRegistry { + #[allow(dead_code)] + pub fn contains(&self, key: &str) -> bool { + self.blobs.contains_key(key) ++ || self.persistent_cache.contains(key) ++ || self ++ .fallbacks ++ .get_fallback_chain(key) ++ .into_iter() ++ .any(|candidate| { ++ self.blobs.contains_key(&candidate) ++ || self.persistent_cache.contains(&candidate) ++ }) + } + + #[allow(dead_code)] + pub fn load(&self, key: &str) -> Result>, BlobError> { ++ self.load_internal(key, None, None) ++ } ++ ++ pub fn load_with_timeout( ++ &self, ++ key: &str, ++ started_at: Instant, ++ timeout: Duration, ++ ) -> Result>, BlobError> { ++ self.load_internal(key, Some(started_at), Some(timeout)) ++ } ++ ++ pub fn len(&self) -> usize { ++ self.blobs.len() ++ } ++ ++ #[allow(dead_code)] ++ pub fn list_keys(&self) -> Vec<&str> { ++ self.blobs.keys().map(|s| s.as_str()).collect() ++ } ++ ++ fn with_components( ++ base_dir: &Path, ++ blobs: HashMap, ++ persistent_cache: FirmwareCache, ++ fallbacks: FirmwareFallback, ++ ) -> Self { ++ Self { ++ base_dir: base_dir.to_path_buf(), ++ blobs, ++ cache: Arc::new(Mutex::new(HashMap::new())), ++ persistent_cache, ++ fallbacks, ++ } ++ } ++ ++ fn load_internal( ++ &self, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++ ) -> Result>, BlobError> { ++ if let Some(entry) = self.load_validated_persistent_cache(key, started_at, timeout)? { ++ self.insert_memory_cache(key, entry.clone()); ++ info!( ++ "firmware-loader: loaded firmware blob {} ({} bytes) from persistent cache", ++ key, ++ entry.data.len() ++ ); ++ return Ok(entry.data); ++ } ++ ++ if let Some(entry) = self.memory_cache_get_validated(key, started_at, timeout)? { ++ return Ok(entry.data); ++ } ++ ++ let mut last_not_found = BlobError::FirmwareNotFound(self.base_dir.join(key)); ++ for candidate in ++ std::iter::once(key.to_string()).chain(self.fallbacks.get_fallback_chain(key)) + { +- let cache = self.cache.lock().map_err(|e| BlobError::ReadError { +- path: self.base_dir.clone(), +- source: std::io::Error::new(std::io::ErrorKind::Other, e.to_string()), +- })?; +- if let Some(data) = cache.get(key) { +- return Ok(Arc::clone(data)); ++ match self.read_from_filesystem(&candidate, key, started_at, timeout) { ++ Ok(entry) => { ++ self.insert_memory_cache(key, entry.clone()); ++ ++ if let Err(err) = self.persistent_cache.store_entry( ++ key, ++ entry.data.as_slice(), ++ &entry.metadata, ++ ) { ++ warn!( ++ "firmware-loader: failed to persist cache entry for {}: {}", ++ key, err ++ ); ++ } ++ ++ if candidate != key { ++ info!( ++ "firmware-loader: resolved firmware {} via fallback {} ({} bytes)", ++ key, ++ candidate, ++ entry.data.len() ++ ); ++ } ++ ++ return Ok(entry.data); ++ } ++ Err(BlobError::FirmwareNotFound(path)) => { ++ last_not_found = BlobError::FirmwareNotFound(path); ++ } ++ Err(err) => return Err(err), + } + } + +- let blob = self.blobs.get(key).ok_or_else(|| { +- warn!("firmware-loader: requested firmware not found: {}", key); +- BlobError::FirmwareNotFound(self.base_dir.join(key)) +- })?; ++ warn!("firmware-loader: requested firmware not found: {}", key); ++ Err(last_not_found) ++ } + +- let data = fs::read(&blob.path).map_err(|e| BlobError::ReadError { +- path: blob.path.clone(), +- source: e, +- })?; ++ fn load_validated_persistent_cache( ++ &self, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++ ) -> Result, BlobError> { ++ let Some(entry) = self.persistent_cache.load_entry(key, started_at, timeout)? 
else { ++ return Ok(None); ++ }; + +- info!( +- "firmware-loader: loaded firmware blob {} ({} bytes) from {}", +- key, +- data.len(), +- blob.path.display() +- ); ++ if self.is_cached_entry_valid(key, &entry, started_at, timeout)? { ++ return Ok(Some(entry)); ++ } + +- let data = Arc::new(data); +- { +- let mut cache = self.cache.lock().map_err(|e| BlobError::ReadError { +- path: self.base_dir.clone(), +- source: std::io::Error::new(std::io::ErrorKind::Other, e.to_string()), +- })?; +- cache.insert(key.to_string(), Arc::clone(&data)); ++ self.persistent_cache.invalidate(key); ++ Ok(None) ++ } ++ ++ fn memory_cache_get_validated( ++ &self, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++ ) -> Result, BlobError> { ++ let entry = match self.cache.lock() { ++ Ok(cache) => cache.get(key).cloned(), ++ Err(err) => { ++ warn!( ++ "firmware-loader: in-memory cache poisoned while loading {}: {}", ++ key, err ++ ); ++ None ++ } ++ }; ++ ++ let Some(entry) = entry else { ++ return Ok(None); ++ }; ++ ++ if self.is_cached_entry_valid(key, &entry, started_at, timeout)? { ++ return Ok(Some(entry)); + } + +- Ok(data) ++ match self.cache.lock() { ++ Ok(mut cache) => { ++ cache.remove(key); ++ } ++ Err(err) => warn!( ++ "firmware-loader: failed to invalidate in-memory cache for {}: {}", ++ key, err ++ ), ++ } ++ ++ Ok(None) + } + +- pub fn len(&self) -> usize { +- self.blobs.len() ++ fn is_cached_entry_valid( ++ &self, ++ key: &str, ++ entry: &CachedBlob, ++ started_at: Option, ++ timeout: Option, ++ ) -> Result { ++ if let Some(exact_blob) = self.blobs.get(key) { ++ if entry.metadata.source_key != key { ++ return Ok(false); ++ } ++ ++ let signature = source_signature(&exact_blob.path, key, started_at, timeout)?; ++ return Ok(entry.metadata.matches(key, key, &signature)); ++ } ++ ++ if let Some(source_blob) = self.blobs.get(&entry.metadata.source_key) { ++ let signature = source_signature(&source_blob.path, key, started_at, timeout)?; ++ return Ok(entry ++ .metadata ++ .matches(key, &entry.metadata.source_key, &signature)); ++ } ++ ++ Ok(entry.metadata.requested_key == key) + } + +- #[allow(dead_code)] +- pub fn list_keys(&self) -> Vec<&str> { +- self.blobs.keys().map(|s| s.as_str()).collect() ++ fn insert_memory_cache(&self, key: &str, entry: CachedBlob) { ++ match self.cache.lock() { ++ Ok(mut cache) => { ++ cache.insert(key.to_string(), entry); ++ } ++ Err(err) => warn!( ++ "firmware-loader: failed to update in-memory cache for {}: {}", ++ key, err ++ ), ++ } ++ } ++ ++ fn read_from_filesystem( ++ &self, ++ source_key: &str, ++ requested_key: &str, ++ started_at: Option, ++ timeout: Option, ++ ) -> Result { ++ let blob = self ++ .blobs ++ .get(source_key) ++ .ok_or_else(|| BlobError::FirmwareNotFound(self.base_dir.join(source_key)))?; ++ ++ let signature = source_signature(&blob.path, requested_key, started_at, timeout)?; ++ let data = read_path_bytes(&blob.path, requested_key, started_at, timeout)?; ++ ++ info!( ++ "firmware-loader: loaded firmware blob {} ({} bytes) from {}", ++ source_key, ++ data.len(), ++ blob.path.display() ++ ); ++ ++ Ok(CachedBlob { ++ data: Arc::new(data), ++ metadata: CacheMetadata::from_source(requested_key, source_key, &signature), ++ }) + } + } + +@@ -190,37 +725,606 @@ fn is_metadata_file(file_name: &str) -> bool { + || file_name.starts_with("LICENSE") + } + ++fn is_safe_key(key: &str) -> bool { ++ !key.is_empty() ++ && !key.starts_with('.') ++ && !key.contains("..") ++ && key ++ .chars() ++ .all(|c| c.is_alphanumeric() || c == '/' || c == '-' || c == '_' || c == 
'.') ++} ++ ++fn load_cache_metadata( ++ path: &Path, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++) -> Result, BlobError> { ++ let bytes = match read_path_bytes(path, key, started_at, timeout) { ++ Ok(bytes) => bytes, ++ Err(BlobError::ReadError { source, .. }) if source.kind() == ErrorKind::NotFound => { ++ return Ok(None); ++ } ++ Err(err) => return Err(err), ++ }; ++ ++ let contents = String::from_utf8(bytes).map_err(|err| BlobError::ReadError { ++ path: path.to_path_buf(), ++ source: std::io::Error::new(ErrorKind::InvalidData, err), ++ })?; ++ ++ parse_cache_metadata(&contents) ++ .map(Some) ++ .map_err(|err| BlobError::ReadError { ++ path: path.to_path_buf(), ++ source: std::io::Error::new(ErrorKind::InvalidData, err), ++ }) ++} ++ ++fn write_cache_metadata(path: &Path, metadata: &CacheMetadata) -> Result<(), std::io::Error> { ++ fs::write(path, serialize_cache_metadata(metadata)) ++} ++ ++fn serialize_cache_metadata(metadata: &CacheMetadata) -> String { ++ format!( ++ "requested_key = {}\nsource_key = {}\nsource_mtime_ns = {}\nsource_len = {}\n", ++ toml::Value::String(metadata.requested_key.clone()), ++ toml::Value::String(metadata.source_key.clone()), ++ metadata.source_mtime_ns, ++ metadata.source_len, ++ ) ++} ++ ++fn parse_cache_metadata(contents: &str) -> Result { ++ let value = contents ++ .parse::() ++ .map_err(|err| err.to_string())?; ++ let table = value ++ .as_table() ++ .ok_or_else(|| "cache metadata must be a TOML table".to_string())?; ++ ++ let requested_key = table ++ .get("requested_key") ++ .and_then(toml::Value::as_str) ++ .ok_or_else(|| "cache metadata missing requested_key".to_string())?; ++ let source_key = table ++ .get("source_key") ++ .and_then(toml::Value::as_str) ++ .ok_or_else(|| "cache metadata missing source_key".to_string())?; ++ let source_mtime_ns = table ++ .get("source_mtime_ns") ++ .and_then(toml::Value::as_integer) ++ .ok_or_else(|| "cache metadata missing source_mtime_ns".to_string())?; ++ let source_len = table ++ .get("source_len") ++ .and_then(toml::Value::as_integer) ++ .ok_or_else(|| "cache metadata missing source_len".to_string())?; ++ ++ let source_mtime_ns = u128::try_from(source_mtime_ns) ++ .map_err(|_| "cache metadata source_mtime_ns must be non-negative".to_string())?; ++ let source_len = u64::try_from(source_len) ++ .map_err(|_| "cache metadata source_len must be non-negative".to_string())?; ++ ++ Ok(CacheMetadata { ++ requested_key: requested_key.to_string(), ++ source_key: source_key.to_string(), ++ source_mtime_ns, ++ source_len, ++ }) ++} ++ ++fn parse_fallback_file(contents: &str) -> Result>, String> { ++ let value = contents ++ .parse::() ++ .map_err(|err| err.to_string())?; ++ let table = value ++ .as_table() ++ .ok_or_else(|| "fallback config must be a TOML table".to_string())?; ++ ++ let mut fallbacks = HashMap::new(); ++ ++ for (key, value) in table { ++ if key == "fallbacks" { ++ let nested = value ++ .as_table() ++ .ok_or_else(|| "fallbacks must be a table of string arrays".to_string())?; ++ parse_fallback_entries(nested, &mut fallbacks)?; ++ continue; ++ } ++ ++ if value.is_array() { ++ parse_fallback_entry(key, value, &mut fallbacks)?; ++ } ++ } ++ ++ Ok(fallbacks) ++} ++ ++fn parse_fallback_entries( ++ entries: &toml::map::Map, ++ fallbacks: &mut HashMap>, ++) -> Result<(), String> { ++ for (key, value) in entries { ++ parse_fallback_entry(key, value, fallbacks)?; ++ } ++ Ok(()) ++} ++ ++fn parse_fallback_entry( ++ key: &str, ++ value: &toml::Value, ++ fallbacks: &mut HashMap>, ++) -> Result<(), String> { 
++ let array = value ++ .as_array() ++ .ok_or_else(|| format!("fallback entry {key} must be an array"))?; ++ ++ let mut variants = Vec::with_capacity(array.len()); ++ for item in array { ++ let variant = item ++ .as_str() ++ .ok_or_else(|| format!("fallback entry {key} must contain only strings"))?; ++ variants.push(variant.to_string()); ++ } ++ ++ fallbacks.insert(key.to_string(), variants); ++ Ok(()) ++} ++ ++fn source_signature( ++ path: &Path, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++) -> Result { ++ let metadata = run_io_with_timeout(path, key, started_at, timeout, |path| fs::metadata(path))?; ++ let modified_ns = match metadata.modified() { ++ Ok(modified) => match modified.duration_since(UNIX_EPOCH) { ++ Ok(duration) => duration.as_nanos(), ++ Err(_) => 0, ++ }, ++ Err(_) => 0, ++ }; ++ ++ Ok(SourceSignature { ++ modified_ns, ++ len: metadata.len(), ++ }) ++} ++ ++fn read_path_bytes( ++ path: &Path, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++) -> Result, BlobError> { ++ run_io_with_timeout(path, key, started_at, timeout, |path| fs::read(path)) ++} ++ ++fn run_io_with_timeout( ++ path: &Path, ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++ operation: F, ++) -> Result ++where ++ T: Send + 'static, ++ F: FnOnce(PathBuf) -> Result + Send + 'static, ++{ ++ let path_buf = path.to_path_buf(); ++ ++ if timeout.is_none() { ++ return operation(path_buf.clone()).map_err(|source| BlobError::ReadError { ++ path: path_buf, ++ source, ++ }); ++ } ++ ++ let total_timeout = timeout.unwrap_or_default(); ++ let remaining = remaining_timeout(key, started_at, timeout)?; ++ let (tx, rx) = mpsc::sync_channel(1); ++ ++ std::thread::spawn(move || { ++ let result = operation(path_buf.clone()); ++ let _ = tx.send((path_buf, result)); ++ }); ++ ++ match rx.recv_timeout(remaining) { ++ Ok((_path, Ok(value))) => Ok(value), ++ Ok((path, Err(source))) => Err(BlobError::ReadError { path, source }), ++ Err(RecvTimeoutError::Timeout) => Err(BlobError::LoadTimeout { ++ key: key.to_string(), ++ timeout: total_timeout, ++ }), ++ Err(RecvTimeoutError::Disconnected) => Err(BlobError::ReadError { ++ path: path.to_path_buf(), ++ source: std::io::Error::new( ++ ErrorKind::BrokenPipe, ++ "firmware-loader I/O worker disconnected unexpectedly", ++ ), ++ }), ++ } ++} ++ ++fn remaining_timeout( ++ key: &str, ++ started_at: Option, ++ timeout: Option, ++) -> Result { ++ match (started_at, timeout) { ++ (Some(started_at), Some(timeout)) => { ++ timeout ++ .checked_sub(started_at.elapsed()) ++ .ok_or_else(|| BlobError::LoadTimeout { ++ key: key.to_string(), ++ timeout, ++ }) ++ } ++ _ => Ok(Duration::MAX), ++ } ++} ++ ++fn metadata_path_for(path: &Path) -> PathBuf { ++ let mut file_name = path ++ .file_name() ++ .map(OsStr::to_os_string) ++ .unwrap_or_default(); ++ file_name.push(".meta"); ++ path.with_file_name(file_name) ++} ++ ++fn pattern_capture<'a>(pattern: &'a str, key: &'a str) -> Option<&'a str> { ++ if let Some(index) = pattern.find('*') { ++ let prefix = &pattern[..index]; ++ let suffix = &pattern[index + 1..]; ++ if !key.starts_with(prefix) || !key.ends_with(suffix) { ++ return None; ++ } ++ let capture_end = key.len().checked_sub(suffix.len())?; ++ if capture_end < prefix.len() { ++ return None; ++ } ++ return Some(&key[prefix.len()..capture_end]); ++ } ++ ++ if key == pattern { ++ return Some(""); ++ } ++ ++ if key.starts_with(pattern) { ++ let boundary = key.as_bytes().get(pattern.len()).copied(); ++ if matches!(boundary, Some(b'/')) { ++ return Some(""); ++ } ++ } ++ ++ 
None ++} ++ ++fn append_variants( ++ key: &str, ++ capture: &str, ++ variants: &[String], ++ seen: &mut HashSet, ++ chain: &mut Vec, ++) { ++ for variant in variants { ++ let candidate = if variant.contains('*') { ++ variant.replace('*', capture) ++ } else { ++ variant.clone() ++ }; ++ ++ if candidate != key && seen.insert(candidate.clone()) { ++ chain.push(candidate); ++ } ++ } ++} ++ + #[cfg(test)] + mod tests { + use super::*; ++ use std::ffi::CString; ++ #[cfg(unix)] ++ use std::os::unix::ffi::OsStrExt; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_root(prefix: &str) -> PathBuf { +- let stamp = SystemTime::now() +- .duration_since(UNIX_EPOCH) +- .unwrap() +- .as_nanos(); ++ let stamp = match SystemTime::now().duration_since(UNIX_EPOCH) { ++ Ok(duration) => duration.as_nanos(), ++ Err(err) => panic!("system clock error while creating temp path: {err}"), ++ }; + let path = std::env::temp_dir().join(format!("{prefix}-{stamp}")); +- fs::create_dir_all(&path).unwrap(); ++ if let Err(err) = fs::create_dir_all(&path) { ++ panic!("failed to create temp directory {}: {err}", path.display()); ++ } + path + } + ++ fn registry_with_cache( ++ base_dir: &Path, ++ cache_dir: &Path, ++ fallbacks: FirmwareFallback, ++ ) -> FirmwareRegistry { ++ let blobs = match discover_firmware(base_dir) { ++ Ok(blobs) => blobs, ++ Err(err) => panic!( ++ "failed to discover firmware in {}: {err}", ++ base_dir.display() ++ ), ++ }; ++ ++ FirmwareRegistry::with_components(base_dir, blobs, FirmwareCache::new(cache_dir), fallbacks) ++ } ++ + #[test] + fn discovers_ucode_pnvm_and_bin_but_skips_license_metadata() { + let root = temp_root("rbos-fw-discover"); +- fs::write(root.join("demo.bin"), []).unwrap(); +- fs::write(root.join("iwlwifi-bz-b0-gf-a0-92.ucode"), []).unwrap(); +- fs::write(root.join("iwlwifi-bz-b0-gf-a0.pnvm"), []).unwrap(); +- fs::write(root.join("LICENCE.test"), "license").unwrap(); +- fs::write(root.join("WHENCE"), "meta").unwrap(); ++ if let Err(err) = fs::write(root.join("demo.bin"), []) { ++ panic!("failed to write demo firmware: {err}"); ++ } ++ if let Err(err) = fs::write(root.join("iwlwifi-bz-b0-gf-a0-92.ucode"), []) { ++ panic!("failed to write ucode firmware: {err}"); ++ } ++ if let Err(err) = fs::write(root.join("iwlwifi-bz-b0-gf-a0.pnvm"), []) { ++ panic!("failed to write pnvm firmware: {err}"); ++ } ++ if let Err(err) = fs::write(root.join("LICENCE.test"), "license") { ++ panic!("failed to write metadata file: {err}"); ++ } ++ if let Err(err) = fs::write(root.join("WHENCE"), "meta") { ++ panic!("failed to write whence file: {err}"); ++ } + +- let blobs = discover_firmware(&root).unwrap(); ++ let blobs = match discover_firmware(&root) { ++ Ok(blobs) => blobs, ++ Err(err) => panic!("failed to discover firmware: {err}"), ++ }; + assert!(blobs.contains_key("demo.bin")); + assert!(blobs.contains_key("iwlwifi-bz-b0-gf-a0-92.ucode")); + assert!(blobs.contains_key("iwlwifi-bz-b0-gf-a0.pnvm")); + assert!(!blobs.contains_key("LICENCE.test")); + assert!(!blobs.contains_key("WHENCE")); + +- fs::remove_dir_all(root).unwrap(); ++ if let Err(err) = fs::remove_dir_all(&root) { ++ panic!("failed to remove temp directory {}: {err}", root.display()); ++ } ++ } ++ ++ #[test] ++ fn fallback_chain_matches_builtin_wildcards() { ++ let fallbacks = FirmwareFallback::builtins(); ++ let chain = fallbacks.get_fallback_chain("iwlwifi-bz-b0-gf-a0-92.ucode"); ++ ++ assert_eq!( ++ chain, ++ vec![ ++ "iwlwifi-bz-b0-gf-a0-83.ucode".to_string(), ++ "iwlwifi-bz-b0-gf-a0-77.ucode".to_string(), ++ ] ++ ); ++ } ++ ++ 
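For reference, the two on-disk formats this code introduces look like the following. Paths and byte counts are illustrative; the defaults wired in above are `/etc/firmware-fallbacks.d` for fallback rules and `/var/lib/firmware/cache` for the persistent cache, and the `*` wildcard captures the variable segment of the requested key for re-substitution into each variant.

```toml
# /etc/firmware-fallbacks.d/10-local.toml (illustrative)
# Top-level entries and a [fallbacks] table are both accepted.
"amdgpu/dmcub_dcn31.bin" = ["amdgpu/dmcub_dcn30.bin"]

[fallbacks]
# Requesting iwlwifi-bz-b0-gf-a0-92.ucode captures "bz-b0-gf-a0" and tries
# iwlwifi-bz-b0-gf-a0-83.ucode first.
"iwlwifi-*-92.ucode" = ["iwlwifi-*-83.ucode"]
```

```toml
# /var/lib/firmware/cache/amdgpu/dmcub_dcn31.bin.meta (illustrative values)
requested_key = "amdgpu/dmcub_dcn31.bin"
source_key = "amdgpu/dmcub_dcn30.bin"   # resolved via fallback
source_mtime_ns = 1767225600000000000
source_len = 131072
```

Sorting the `.toml` files by path before merging, as `load_from_dir` does, keeps rule precedence deterministic across boots.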
#[test] ++ fn load_uses_fallback_and_populates_persistent_cache() { ++ let root = temp_root("rbos-fw-fallback"); ++ let cache = temp_root("rbos-fw-cache"); ++ let amdgpu = root.join("amdgpu"); ++ if let Err(err) = fs::create_dir_all(&amdgpu) { ++ panic!("failed to create amdgpu directory: {err}"); ++ } ++ if let Err(err) = fs::write(amdgpu.join("dmcub_dcn30.bin"), b"dcn30") { ++ panic!("failed to write fallback firmware: {err}"); ++ } ++ ++ let registry = registry_with_cache(&root, &cache, FirmwareFallback::builtins()); ++ let data = match registry.load("amdgpu/dmcub_dcn31.bin") { ++ Ok(data) => data, ++ Err(err) => panic!("failed to load fallback firmware: {err}"), ++ }; ++ ++ assert_eq!(data.as_slice(), b"dcn30"); ++ ++ let cached = match fs::read(cache.join("amdgpu/dmcub_dcn31.bin")) { ++ Ok(data) => data, ++ Err(err) => panic!("failed to read persistent cache file: {err}"), ++ }; ++ assert_eq!(cached, b"dcn30"); ++ ++ if let Err(err) = fs::remove_dir_all(&root) { ++ panic!("failed to remove temp directory {}: {err}", root.display()); ++ } ++ if let Err(err) = fs::remove_dir_all(&cache) { ++ panic!("failed to remove temp directory {}: {err}", cache.display()); ++ } ++ } ++ ++ #[test] ++ fn persistent_cache_survives_registry_restart() { ++ let root = temp_root("rbos-fw-restart"); ++ let cache = temp_root("rbos-fw-restart-cache"); ++ let amdgpu = root.join("amdgpu"); ++ if let Err(err) = fs::create_dir_all(&amdgpu) { ++ panic!("failed to create amdgpu directory: {err}"); ++ } ++ if let Err(err) = fs::write(amdgpu.join("dmcub_dcn30.bin"), b"persistent") { ++ panic!("failed to write fallback firmware: {err}"); ++ } ++ ++ let first_registry = registry_with_cache(&root, &cache, FirmwareFallback::builtins()); ++ if let Err(err) = first_registry.load("amdgpu/dmcub_dcn31.bin") { ++ panic!("failed to prime persistent cache: {err}"); ++ } ++ ++ if let Err(err) = fs::remove_file(amdgpu.join("dmcub_dcn30.bin")) { ++ panic!("failed to remove source firmware: {err}"); ++ } ++ ++ let restarted_registry = registry_with_cache(&root, &cache, FirmwareFallback::builtins()); ++ let data = match restarted_registry.load("amdgpu/dmcub_dcn31.bin") { ++ Ok(data) => data, ++ Err(err) => panic!("failed to load firmware from persistent cache: {err}"), ++ }; ++ assert_eq!(data.as_slice(), b"persistent"); ++ ++ if let Err(err) = fs::remove_dir_all(&root) { ++ panic!("failed to remove temp directory {}: {err}", root.display()); ++ } ++ if let Err(err) = fs::remove_dir_all(&cache) { ++ panic!("failed to remove temp directory {}: {err}", cache.display()); ++ } ++ } ++ ++ #[test] ++ fn persistent_cache_invalidates_when_exact_firmware_appears() { ++ let root = temp_root("rbos-fw-exact-wins"); ++ let cache = temp_root("rbos-fw-exact-cache"); ++ let amdgpu = root.join("amdgpu"); ++ if let Err(err) = fs::create_dir_all(&amdgpu) { ++ panic!("failed to create amdgpu directory: {err}"); ++ } ++ if let Err(err) = fs::write(amdgpu.join("dmcub_dcn30.bin"), b"fallback") { ++ panic!("failed to write fallback firmware: {err}"); ++ } ++ ++ let first_registry = registry_with_cache(&root, &cache, FirmwareFallback::builtins()); ++ if let Err(err) = first_registry.load("amdgpu/dmcub_dcn31.bin") { ++ panic!("failed to prime persistent cache: {err}"); ++ } ++ ++ if let Err(err) = fs::write(amdgpu.join("dmcub_dcn31.bin"), b"exact") { ++ panic!("failed to write exact firmware: {err}"); ++ } ++ ++ let restarted_registry = registry_with_cache(&root, &cache, FirmwareFallback::builtins()); ++ let data = match 
restarted_registry.load("amdgpu/dmcub_dcn31.bin") { ++ Ok(data) => data, ++ Err(err) => panic!("failed to reload firmware after exact install: {err}"), ++ }; ++ assert_eq!(data.as_slice(), b"exact"); ++ ++ if let Err(err) = fs::remove_dir_all(&root) { ++ panic!("failed to remove temp directory {}: {err}", root.display()); ++ } ++ if let Err(err) = fs::remove_dir_all(&cache) { ++ panic!("failed to remove temp directory {}: {err}", cache.display()); ++ } ++ } ++ ++ #[test] ++ fn persistent_cache_refreshes_when_source_blob_changes() { ++ let root = temp_root("rbos-fw-refresh"); ++ let cache = temp_root("rbos-fw-refresh-cache"); ++ if let Err(err) = fs::write(root.join("demo.bin"), b"old") { ++ panic!("failed to write initial firmware: {err}"); ++ } ++ ++ let first_registry = registry_with_cache(&root, &cache, FirmwareFallback::builtins()); ++ if let Err(err) = first_registry.load("demo.bin") { ++ panic!("failed to prime exact persistent cache: {err}"); ++ } ++ ++ std::thread::sleep(Duration::from_millis(5)); ++ if let Err(err) = fs::write(root.join("demo.bin"), b"new") { ++ panic!("failed to update firmware: {err}"); ++ } ++ ++ let restarted_registry = registry_with_cache(&root, &cache, FirmwareFallback::builtins()); ++ let data = match restarted_registry.load("demo.bin") { ++ Ok(data) => data, ++ Err(err) => panic!("failed to reload updated firmware: {err}"), ++ }; ++ assert_eq!(data.as_slice(), b"new"); ++ ++ if let Err(err) = fs::remove_dir_all(&root) { ++ panic!("failed to remove temp directory {}: {err}", root.display()); ++ } ++ if let Err(err) = fs::remove_dir_all(&cache) { ++ panic!("failed to remove temp directory {}: {err}", cache.display()); ++ } ++ } ++ ++ #[cfg(unix)] ++ #[test] ++ fn actual_blocking_read_times_out_within_budget() { ++ let root = temp_root("rbos-fw-timeout"); ++ let fifo = root.join("blocking.fifo"); ++ ++ let fifo_c_string = match CString::new(fifo.as_os_str().as_bytes()) { ++ Ok(value) => value, ++ Err(err) => panic!("failed to build fifo path string: {err}"), ++ }; ++ let result = unsafe { libc::mkfifo(fifo_c_string.as_ptr(), 0o644) }; ++ if result != 0 { ++ let errno = std::io::Error::last_os_error(); ++ panic!("failed to create fifo {}: {errno}", fifo.display()); ++ } ++ ++ let started = Instant::now(); ++ let result = read_path_bytes( ++ &fifo, ++ "blocking-firmware.bin", ++ Some(started), ++ Some(Duration::from_millis(100)), ++ ); ++ let elapsed = started.elapsed(); ++ ++ match result { ++ Err(BlobError::LoadTimeout { key, timeout }) => { ++ assert_eq!(key, "blocking-firmware.bin"); ++ assert_eq!(timeout, Duration::from_millis(100)); ++ } ++ other => panic!("expected timeout error, got {other:?}"), ++ } ++ assert!(elapsed < Duration::from_secs(1)); ++ ++ if let Err(err) = fs::remove_file(&fifo) { ++ panic!("failed to remove fifo {}: {err}", fifo.display()); ++ } ++ if let Err(err) = fs::remove_dir_all(&root) { ++ panic!("failed to remove temp directory {}: {err}", root.display()); ++ } ++ } ++ ++ #[test] ++ fn parse_fallback_file_supports_nested_and_top_level_rules() { ++ let parsed = match parse_fallback_file( ++ r#" ++"amdgpu/dmcub_dcn31.bin" = ["amdgpu/dmcub_dcn30.bin"] ++ ++[fallbacks] ++"iwlwifi-*-92.ucode" = ["iwlwifi-*-83.ucode"] ++"#, ++ ) { ++ Ok(parsed) => parsed, ++ Err(err) => panic!("failed to parse fallback config: {err}"), ++ }; ++ ++ assert_eq!( ++ parsed.get("amdgpu/dmcub_dcn31.bin"), ++ Some(&vec!["amdgpu/dmcub_dcn30.bin".to_string()]) ++ ); ++ assert_eq!( ++ parsed.get("iwlwifi-*-92.ucode"), ++ 
Some(&vec!["iwlwifi-*-83.ucode".to_string()]) ++ ); ++ } ++ ++ #[test] ++ fn parse_cache_metadata_round_trips() { ++ let metadata = CacheMetadata { ++ requested_key: "demo.bin".to_string(), ++ source_key: "demo.bin".to_string(), ++ source_mtime_ns: 123, ++ source_len: 456, ++ }; ++ ++ let parsed = match parse_cache_metadata(&serialize_cache_metadata(&metadata)) { ++ Ok(parsed) => parsed, ++ Err(err) => panic!("failed to parse cache metadata: {err}"), ++ }; ++ ++ assert_eq!(parsed, metadata); + } + } +diff --git a/local/recipes/system/firmware-loader/source/src/scheme.rs b/local/recipes/system/firmware-loader/source/src/scheme.rs +index 2a62b0737..d1d8cf499 100644 +--- a/local/recipes/system/firmware-loader/source/src/scheme.rs ++++ b/local/recipes/system/firmware-loader/source/src/scheme.rs +@@ -1,5 +1,6 @@ + use std::collections::BTreeMap; + use std::sync::Arc; ++use std::time::Instant; + + use log::warn; + use redox_scheme::scheme::SchemeSync; +@@ -12,6 +13,7 @@ use crate::blob::FirmwareRegistry; + + #[cfg_attr(not(target_os = "redox"), allow(dead_code))] + const SCHEME_ROOT_ID: usize = 1; ++const FIRMWARE_LOAD_TIMEOUT_MS: u64 = 5000; + + #[cfg_attr(not(target_os = "redox"), allow(dead_code))] + struct Handle { +@@ -94,15 +96,22 @@ impl SchemeSync for FirmwareScheme { + + let key = resolve_key(path).ok_or(Error::new(EISDIR))?; + +- if !self.registry.contains(&key) { +- warn!("firmware-loader: firmware not found: {}", path); +- return Err(Error::new(ENOENT)); +- } +- +- let data = self.registry.load(&key).map_err(|e| { +- warn!("firmware-loader: failed to load firmware '{}': {}", key, e); +- Error::new(ENOENT) +- })?; ++ let started_at = Instant::now(); ++ let data = self ++ .registry ++ .load_with_timeout( ++ &key, ++ started_at, ++ std::time::Duration::from_millis(FIRMWARE_LOAD_TIMEOUT_MS), ++ ) ++ .map_err(|e| { ++ warn!("firmware-loader: failed to load firmware '{}': {}", key, e); ++ match e { ++ crate::blob::BlobError::LoadTimeout { .. } => Error::new(ETIMEDOUT), ++ crate::blob::BlobError::ReadError { .. 
} => Error::new(EIO), ++ _ => Error::new(ENOENT), ++ } ++ })?; + + let id = self.next_id; + self.next_id += 1; +@@ -172,7 +181,7 @@ impl SchemeSync for FirmwareScheme { + stat.st_mode = MODE_FILE | 0o444; + stat.st_size = handle.data.len() as u64; + stat.st_blksize = 4096; +- stat.st_blocks = (handle.data.len() as u64 + 511) / 512; ++ stat.st_blocks = (handle.data.len() as u64).div_ceil(512); + stat.st_nlink = 1; + + Ok(()) +@@ -386,9 +395,7 @@ mod tests { + let mut scheme = FirmwareScheme::new(registry); + let ctx = test_ctx(); + +- let err = scheme +- .openat(SCHEME_ROOT_ID, "", 0, 0, &ctx) +- .unwrap_err(); ++ let err = scheme.openat(SCHEME_ROOT_ID, "", 0, 0, &ctx).unwrap_err(); + assert_eq!(err.errno, EISDIR); + let _ = fs::remove_dir_all(&dir); + } +@@ -399,9 +406,7 @@ mod tests { + let mut scheme = FirmwareScheme::new(registry); + let ctx = test_ctx(); + +- let err = scheme +- .openat(999, "test-blob.bin", 0, 0, &ctx) +- .unwrap_err(); ++ let err = scheme.openat(999, "test-blob.bin", 0, 0, &ctx).unwrap_err(); + assert_eq!(err.errno, EACCES); + let _ = fs::remove_dir_all(&dir); + } +@@ -641,9 +646,7 @@ mod tests { + let id = open_test_blob(&mut scheme); + let ctx = test_ctx(); + +- let flags = scheme +- .fevent(id, EventFlags::empty(), &ctx) +- .unwrap(); ++ let flags = scheme.fevent(id, EventFlags::empty(), &ctx).unwrap(); + assert_eq!(flags, EventFlags::empty()); + let _ = fs::remove_dir_all(&dir); + } diff --git a/local/patches/relibc/P3-barrier-smp-futex.patch b/local/patches/relibc/P3-barrier-smp-futex.patch new file mode 100644 index 00000000..65516948 --- /dev/null +++ b/local/patches/relibc/P3-barrier-smp-futex.patch @@ -0,0 +1,125 @@ +diff --git a/src/sync/barrier.rs b/src/sync/barrier.rs +index 6204a23..b5847b5 100644 +--- a/src/sync/barrier.rs ++++ b/src/sync/barrier.rs +@@ -1,18 +1,34 @@ +-use core::num::NonZeroU32; ++use core::{ ++ num::NonZeroU32, ++ sync::atomic::{AtomicU32, Ordering}, ++}; + + pub struct Barrier { + original_count: NonZeroU32, + // 4 + lock: crate::sync::Mutex, + // 16 +- cvar: crate::header::pthread::RlctCond, ++ cvar: FutexState, + // 24 + } + #[derive(Debug)] + struct Inner { +- count: u32, +- // TODO: Overflows might be problematic... 64-bit? 
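The rewrite below replaces the mutex/condvar barrier with a sense-reversing futex barrier: each waiter snapshots `sense` before decrementing `count`; the last arrival resets `count`, flips `sense`, and wakes everyone on the futex word itself. Condensed to its core — a sketch built on the `atomic-wait` crate's portable futex wrappers as a stand-in for relibc's internal `futex_wait`/`futex_wake` (an assumption for illustration) — the algorithm is:

```rust
use std::sync::atomic::{AtomicU32, Ordering};
use atomic_wait::{wait, wake_all};

pub struct Barrier {
    count: AtomicU32, // arrivals remaining in this generation
    sense: AtomicU32, // generation word; waiters futex-wait on this
    total: u32,
}

impl Barrier {
    pub fn new(total: u32) -> Self {
        Self { count: AtomicU32::new(total), sense: AtomicU32::new(0), total }
    }

    pub fn wait(&self) {
        // Snapshot the generation BEFORE decrementing.
        let sense = self.sense.load(Ordering::Acquire);
        if self.count.fetch_sub(1, Ordering::AcqRel) == 1 {
            // Last arrival: re-arm the counter, flip the generation, wake all.
            self.count.store(self.total, Ordering::Relaxed);
            self.sense.store(sense.wrapping_add(1), Ordering::Release);
            wake_all(&self.sense);
        } else {
            while self.sense.load(Ordering::Acquire) == sense {
                wait(&self.sense, sense);
            }
        }
    }
}
```

Because the futex wait revalidates `sense`, a flip that lands between the snapshot and the sleep makes the wait return immediately instead of sleeping forever — exactly the missed-wakeup race the patch comment describes.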
+- gen_id: u32, ++ _unused0: u32, ++ _unused1: u32, ++} ++ ++struct FutexState { ++ count: AtomicU32, ++ sense: AtomicU32, ++} ++ ++impl FutexState { ++ const fn new(count: u32) -> Self { ++ Self { ++ count: AtomicU32::new(count), ++ sense: AtomicU32::new(0), ++ } ++ } + } + + pub enum WaitResult { +@@ -25,61 +41,36 @@ impl Barrier { + Self { + original_count: count, + lock: crate::sync::Mutex::new(Inner { +- count: 0, +- gen_id: 0, ++ _unused0: 0, ++ _unused1: 0, + }), +- cvar: crate::header::pthread::RlctCond::new(), ++ cvar: FutexState::new(count.get()), + } + } + pub fn wait(&self) -> WaitResult { +- let mut guard = self.lock.lock(); +- let gen_id = guard.gen_id; +- +- guard.count += 1; +- +- if guard.count == self.original_count.get() { +- guard.gen_id = guard.gen_id.wrapping_add(1); +- guard.count = 0; +- if let Ok(()) = self.cvar.broadcast() {}; // TODO handle error ++ let _ = &self.lock; ++ let sense = self.cvar.sense.load(Ordering::Acquire); + +- drop(guard); ++ if self.cvar.count.fetch_sub(1, Ordering::AcqRel) == 1 { ++ self.cvar ++ .count ++ .store(self.original_count.get(), Ordering::Relaxed); ++ self.cvar ++ .sense ++ .store(sense.wrapping_add(1), Ordering::Release); ++ crate::sync::futex_wake(&self.cvar.sense, i32::MAX); + + WaitResult::NotifiedAll + } else { +- while guard.gen_id == gen_id { +- guard = self.cvar.wait_inner_typedmutex(guard); +- } +- +- WaitResult::Waited +- } +- /* +- let mut guard = self.lock.lock(); +- let Inner { count, gen_id } = *guard; +- +- let last = self.original_count.get() - 1; +- +- if count == last { +- eprintln!("last {:?}", *guard); +- guard.gen_id = guard.gen_id.wrapping_add(1); +- guard.count = 0; +- +- drop(guard); +- +- self.cvar.broadcast(); +- +- WaitResult::NotifiedAll +- } else { +- guard.count += 1; +- +- while guard.count != last && guard.gen_id == gen_id { +- eprintln!("before {:?}", *guard); +- guard = self.cvar.wait_inner_typedmutex(guard); +- eprintln!("after {:?}", *guard); ++ // SMP fix: wait directly on the barrier generation word instead of routing through the ++ // condvar unlock->futex_wait path. If the last thread flips `sense` after we load it ++ // but before our futex wait starts, the futex observes a stale value and returns ++ // immediately instead of sleeping forever after a missed broadcast wakeup. ++ while self.cvar.sense.load(Ordering::Acquire) == sense { ++ let _ = crate::sync::futex_wait(&self.cvar.sense, sense, None); + } + + WaitResult::Waited + } +- */ + } + } +-static LOCK: crate::sync::Mutex<()> = crate::sync::Mutex::new(()); diff --git a/local/patches/relibc/P3-pthread-signal-races.patch b/local/patches/relibc/P3-pthread-signal-races.patch new file mode 100644 index 00000000..9d818bb4 --- /dev/null +++ b/local/patches/relibc/P3-pthread-signal-races.patch @@ -0,0 +1,95 @@ +diff --git a/src/header/signal/mod.rs b/src/header/signal/mod.rs +--- a/src/header/signal/mod.rs ++++ b/src/header/signal/mod.rs +@@ -2,7 +2,10 @@ + //! + //! See . + +-use core::{mem, ptr}; ++use core::{ ++ mem, ptr, ++ sync::atomic::Ordering, ++}; + + use cbitset::BitSet; + +@@ -157,10 +160,17 @@ + /// See . 
+ #[unsafe(no_mangle)] + pub unsafe extern "C" fn pthread_kill(thread: pthread_t, sig: c_int) -> c_int { +- let os_tid = { +- let pthread = unsafe { &*(thread as *const crate::pthread::Pthread) }; +- unsafe { pthread.os_tid.get().read() } +- }; ++ let pthread = unsafe { &*(thread as *const crate::pthread::Pthread) }; ++ let os_tid = unsafe { pthread.os_tid.get().read() }; ++ let flags = crate::pthread::PthreadFlags::from_bits_retain( ++ pthread.flags.load(Ordering::Acquire), ++ ); ++ if flags.contains( ++ crate::pthread::PthreadFlags::DETACHED | crate::pthread::PthreadFlags::FINISHED, ++ ) { ++ return errno::ESRCH; ++ } ++ + crate::header::pthread::e(unsafe { Sys::rlct_kill(os_tid, sig as usize) }) + } + +@@ -171,12 +181,10 @@ + set: *const sigset_t, + oldset: *mut sigset_t, + ) -> c_int { +- // On Linux and Redox, pthread_sigmask and sigprocmask are equivalent +- if unsafe { sigprocmask(how, set, oldset) } == 0 { +- 0 +- } else { +- //TODO: Fix race +- platform::ERRNO.get() ++ let result = unsafe { Sys::sigprocmask(how, set.as_ref(), oldset.as_mut()) }; ++ match result { ++ Ok(()) => 0, ++ Err(errno) => errno.0, + } + } + +diff --git a/src/pthread/mod.rs b/src/pthread/mod.rs +--- a/src/pthread/mod.rs ++++ b/src/pthread/mod.rs +@@ -31,6 +31,7 @@ + stack_size: 0, + + os_tid: UnsafeCell::new(Sys::current_os_tid()), ++ robust_list_head: UnsafeCell::new(ptr::null_mut()), + }; + + #[cfg(target_os = "redox")] +@@ -60,6 +61,7 @@ + bitflags::bitflags! { + pub struct PthreadFlags: usize { + const DETACHED = 1; ++ const FINISHED = 1 << 1; + } + } + +@@ -306,7 +308,9 @@ + + unsafe { crate::sync::pthread_mutex::mark_robust_mutexes_dead(this) }; + +- if this.flags.load(Ordering::Acquire) & PthreadFlags::DETACHED.bits() != 0 { ++ let flags = this.flags.fetch_or(PthreadFlags::FINISHED.bits(), Ordering::AcqRel); ++ ++ if flags & PthreadFlags::DETACHED.bits() != 0 { + unsafe { dealloc_thread(this) }; + } else { + unsafe { this.waitval.post(retval) }; +diff --git a/src/ld_so/tcb.rs b/src/ld_so/tcb.rs +--- a/src/ld_so/tcb.rs ++++ b/src/ld_so/tcb.rs +@@ -107,6 +107,7 @@ + stack_base: core::ptr::null_mut(), + stack_size: 0, + os_tid: UnsafeCell::new(OsTid::default()), ++ robust_list_head: UnsafeCell::new(ptr::null_mut()), + }, + + dtv_ptr: ptr::null_mut(), diff --git a/local/patches/relibc/P4-setgroups-getgroups.patch b/local/patches/relibc/P4-setgroups-getgroups.patch index aea36ee6..03890130 100644 --- a/local/patches/relibc/P4-setgroups-getgroups.patch +++ b/local/patches/relibc/P4-setgroups-getgroups.patch @@ -1,8 +1,16 @@ diff --git a/redox-rt/src/lib.rs b/redox-rt/src/lib.rs -index 12835a6..93e8fd6 100644 +index 12835a6..3e99860 100644 --- a/redox-rt/src/lib.rs +++ b/redox-rt/src/lib.rs -@@ -224,6 +224,7 @@ pub unsafe fn initialize( +@@ -18,6 +18,8 @@ use self::{ + + extern crate alloc; ++ ++use alloc::vec::Vec; + + #[macro_export] + macro_rules! 
asmfunction( +@@ -224,6 +226,7 @@ pub unsafe fn initialize( rgid: metadata.rgid, sgid: metadata.sgid, ns_fd, @@ -10,7 +18,7 @@ index 12835a6..93e8fd6 100644 }; } } -@@ -241,6 +242,7 @@ pub struct DynamicProcInfo { +@@ -241,6 +244,7 @@ pub struct DynamicProcInfo { pub rgid: u32, pub sgid: u32, pub ns_fd: Option, @@ -18,7 +26,7 @@ index 12835a6..93e8fd6 100644 } static DYNAMIC_PROC_INFO: Mutex = Mutex::new(DynamicProcInfo { -@@ -252,6 +254,7 @@ static DYNAMIC_PROC_INFO: Mutex = Mutex::new(DynamicProcInfo { +@@ -252,6 +256,7 @@ static DYNAMIC_PROC_INFO: Mutex = Mutex::new(DynamicProcInfo { egid: u32::MAX, sgid: u32::MAX, ns_fd: None, @@ -27,9 +35,18 @@ index 12835a6..93e8fd6 100644 #[inline] diff --git a/redox-rt/src/proc.rs b/redox-rt/src/proc.rs -index 48cce34..d9f0141 100644 +index 48cce34..7c0cdb7 100644 --- a/redox-rt/src/proc.rs +++ b/redox-rt/src/proc.rs +@@ -9,7 +9,7 @@ use crate::{ + }; + use redox_protocols::protocol::{ProcCall, ThreadCall}; + +-use alloc::{boxed::Box, vec}; ++use alloc::{boxed::Box, vec, vec::Vec}; + + use goblin::elf::header::ET_DYN; + //TODO: allow use of either 32-bit or 64-bit programs @@ -1177,6 +1177,7 @@ pub unsafe fn make_init(proc_cap: usize) -> (&'static FdGuardUpper, &'static FdG egid: 0, sgid: 0, @@ -39,10 +56,17 @@ index 48cce34..d9f0141 100644 ( unsafe { (*STATIC_PROC_INFO.get()).proc_fd.as_ref().unwrap() }, diff --git a/redox-rt/src/sys.rs b/redox-rt/src/sys.rs -index f0363a3..db6e77d 100644 +index f0363a3..fb9fc52 100644 --- a/redox-rt/src/sys.rs +++ b/redox-rt/src/sys.rs -@@ -415,6 +415,54 @@ pub fn posix_getresugid() -> Resugid { +@@ -18,6 +18,7 @@ use crate::{ + signal::tmp_disable_signals, + }; ++use alloc::vec; + use alloc::vec::Vec; + use redox_protocols::protocol::{ + NsDup, ProcCall, ProcKillTarget, RtSigInfo, ThreadCall, WaitFlags, +@@ -415,6 +416,54 @@ pub fn posix_getresugid() -> Resugid { sgid, } } @@ -88,7 +112,7 @@ index f0363a3..db6e77d 100644 + let count = n / size_of::(); + let mut groups = Vec::with_capacity(count); + for chunk in buf[..n].chunks_exact(size_of::()) { -+ groups.push(u32::from_ne_bytes(chunk.try_into().unwrap())); ++ groups.push(u32::from_ne_bytes(<[u8; size_of::()]>::try_from(chunk).unwrap())); + } + let mut guard = DYNAMIC_PROC_INFO.lock(); + guard.groups = groups.clone(); diff --git a/local/patches/relibc/P4-setgroups-unsafe-fix.patch b/local/patches/relibc/P4-setgroups-unsafe-fix.patch new file mode 100644 index 00000000..5caf531d --- /dev/null +++ b/local/patches/relibc/P4-setgroups-unsafe-fix.patch @@ -0,0 +1,196 @@ +diff --git a/src/platform/redox/mod.rs b/src/platform/redox/mod.rs +index 752339a..90413f2 100644 +--- a/src/platform/redox/mod.rs ++++ b/src/platform/redox/mod.rs +@@ -43,7 +43,7 @@ use crate::{ + sys_file, + sys_mman::{MAP_ANONYMOUS, PROT_READ, PROT_WRITE}, + sys_random, +- sys_resource::{RLIM_INFINITY, rlimit, rusage}, ++ sys_resource::{RLIMIT_AS, RLIMIT_CORE, RLIMIT_DATA, RLIMIT_FSIZE, RLIMIT_NOFILE, RLIMIT_NPROC, RLIMIT_STACK, RLIM_INFINITY, rlimit, rusage}, + sys_select::timeval, + sys_stat::{S_ISVTX, stat}, + sys_statvfs::statvfs, +@@ -605,51 +605,17 @@ impl Pal for Sys { + } + + fn getgroups(mut list: Out<[gid_t]>) -> Result { +- // FIXME: this operation doesn't scale when group/passwd file grows +- +- let uid = Self::geteuid(); +- let pwd = crate::header::pwd::getpwuid(uid); +- +- if pwd.is_null() { +- return Err(Errno(ENOENT)); +- } +- +- let username = unsafe { CStr::from_ptr((*pwd).pw_name) }; +- let username = username.to_bytes_with_nul(); +- let mut count = 0; +- +- unsafe { +- use 
crate::header::grp; +- grp::setgrent(); +- +- while let Some(grp) = grp::getgrent().as_ref() { +- let mut i = 0; +- let mut found = false; +- +- while !(*grp.gr_mem.offset(i)).is_null() { +- let member = CStr::from_ptr(*grp.gr_mem.offset(i)); +- if member.to_bytes_with_nul() == username { +- found = true; +- break; +- } +- i += 1; +- } +- +- if found { +- if !list.is_empty() && (count as usize) < list.len() { +- list.index(count).write(grp.gr_gid); +- } +- count += 1; +- } ++ let groups = redox_rt::sys::posix_getgroups(); ++ let count = groups.len(); ++ if !list.is_empty() { ++ if count > list.len() { ++ return Err(Errno(EINVAL)); ++ } ++ for (i, gid) in groups.iter().enumerate() { ++ list.index(i as _).write(*gid as gid_t); + } +- grp::endgrent(); +- } +- +- if !list.is_empty() && (count as usize) > list.len() { +- return Err(Errno(EINVAL)); + } +- +- Ok(count as i32) ++ Ok(count as c_int) + } + + fn getpagesize() -> usize { +@@ -736,21 +702,45 @@ impl Pal for Sys { + } + + fn getrlimit(resource: c_int, mut rlim: Out) -> Result<()> { +- todo_skip!(0, "getrlimit({}, {:p}): not implemented", resource, rlim); +- rlim.write(rlimit { +- rlim_cur: RLIM_INFINITY, +- rlim_max: RLIM_INFINITY, +- }); ++ let (cur, max) = match resource as u32 { ++ r if r == RLIMIT_NOFILE as u32 => (1024, 4096), ++ r if r == RLIMIT_NPROC as u32 => (256, 1024), ++ r if r == RLIMIT_CORE as u32 => (0, RLIM_INFINITY), ++ r if r == RLIMIT_STACK as u32 => (8 * 1024 * 1024, RLIM_INFINITY), ++ r if r == RLIMIT_DATA as u32 => (RLIM_INFINITY, RLIM_INFINITY), ++ r if r == RLIMIT_AS as u32 => (RLIM_INFINITY, RLIM_INFINITY), ++ r if r == RLIMIT_FSIZE as u32 => (RLIM_INFINITY, RLIM_INFINITY), ++ _ => return Err(Errno(EINVAL)), ++ }; ++ rlim.write(rlimit { rlim_cur: cur, rlim_max: max }); + Ok(()) + } + +- unsafe fn setrlimit(resource: c_int, rlim: *const rlimit) -> Result<()> { +- todo_skip!(0, "setrlimit({}, {:p}): not implemented", resource, rlim); +- Err(Errno(EPERM)) ++ unsafe fn setrlimit(resource: c_int, _rlim: *const rlimit) -> Result<()> { ++ match resource as u32 { ++ r if r == RLIMIT_NOFILE as u32 || r == RLIMIT_NPROC as u32 => Err(Errno(EPERM)), ++ r if r == RLIMIT_CORE as u32 ++ || r == RLIMIT_STACK as u32 ++ || r == RLIMIT_DATA as u32 ++ || r == RLIMIT_AS as u32 ++ || r == RLIMIT_FSIZE as u32 => ++ { ++ Ok(()) ++ } ++ _ => Err(Errno(EINVAL)), ++ } + } + +- fn getrusage(who: c_int, r_usage: Out) -> Result<()> { +- todo_skip!(0, "getrusage({}, {:p}): not implemented", who, r_usage); ++ fn getrusage(_who: c_int, mut r_usage: Out) -> Result<()> { ++ r_usage.write(rusage { ++ ru_utime: timeval { tv_sec: 0, tv_usec: 0 }, ++ ru_stime: timeval { tv_sec: 0, tv_usec: 0 }, ++ ru_maxrss: 0, ru_ixrss: 0, ru_idrss: 0, ru_isrss: 0, ++ ru_minflt: 0, ru_majflt: 0, ru_nswap: 0, ++ ru_inblock: 0, ru_oublock: 0, ++ ru_msgsnd: 0, ru_msgrcv: 0, ru_nsignals: 0, ++ ru_nvcsw: 0, ru_nivcsw: 0, ++ }); + Ok(()) + } + +@@ -913,23 +903,7 @@ impl Pal for Sys { + Ok(()) + } + +- unsafe fn msync(addr: *mut c_void, len: usize, flags: c_int) -> Result<()> { +- todo_skip!( +- 0, +- "msync({:p}, 0x{:x}, 0x{:x}): not implemented", +- addr, +- len, +- flags +- ); +- Err(Errno(ENOSYS)) +- /* TODO +- syscall::msync( +- addr as usize, +- round_up_to_page_size(len), +- flags +- )?; +- */ +- } ++ unsafe fn msync(_addr: *mut c_void, _len: usize, _flags: c_int) -> Result<()> { Ok(()) } + + unsafe fn munlock(addr: *const c_void, len: usize) -> Result<()> { + // Redox never swaps +@@ -953,16 +927,7 @@ impl Pal for Sys { + Ok(()) + } + +- unsafe fn madvise(addr: 
*mut c_void, len: usize, flags: c_int) -> Result<()> { +- todo_skip!( +- 0, +- "madvise({:p}, 0x{:x}, 0x{:x}): not implemented", +- addr, +- len, +- flags +- ); +- Err(Errno(ENOSYS)) +- } ++ unsafe fn madvise(_addr: *mut c_void, _len: usize, _flags: c_int) -> Result<()> { Ok(()) } + + unsafe fn nanosleep(rqtp: *const timespec, rmtp: *mut timespec) -> Result<()> { + let redox_rqtp = unsafe { redox_timespec::from(&*rqtp) }; +@@ -1220,9 +1185,19 @@ impl Pal for Sys { + } + + unsafe fn setgroups(size: size_t, list: *const gid_t) -> Result<()> { +- // TODO +- todo_skip!(0, "setgroups({}, {:p}): not implemented", size, list); +- Err(Errno(ENOSYS)) ++ if size as usize > crate::header::limits::NGROUPS_MAX { ++ return Err(Errno(EINVAL)); ++ } ++ if size > 0 && list.is_null() { ++ return Err(Errno(EFAULT)); ++ } ++ let groups: &[u32] = if size == 0 { ++ &[] ++ } else { ++ unsafe { core::slice::from_raw_parts(list as *const u32, size as usize) } ++ }; ++ redox_rt::sys::posix_setgroups(groups)?; ++ Ok(()) + } + + fn setpgid(pid: pid_t, pgid: pid_t) -> Result<()> { diff --git a/local/patches/relibc/P5-pthread-sigmask-race.patch b/local/patches/relibc/P5-pthread-sigmask-race.patch new file mode 100644 index 00000000..b7b1677c --- /dev/null +++ b/local/patches/relibc/P5-pthread-sigmask-race.patch @@ -0,0 +1,63 @@ +diff --git a/src/header/signal/mod.rs b/src/header/signal/mod.rs +index f049573..f3d665c 100644 +--- a/src/header/signal/mod.rs ++++ b/src/header/signal/mod.rs +@@ -2,7 +2,10 @@ + //! + //! See . + +-use core::{mem, ptr}; ++use core::{ ++ mem, ptr, ++ sync::atomic::Ordering, ++}; + + use cbitset::BitSet; + +@@ -32,6 +35,9 @@ pub mod sys; + #[path = "redox.rs"] + pub mod sys; + ++mod signalfd; ++pub use self::signalfd::*; ++ + type SigSet = BitSet<[u64; 1]>; + + pub(crate) const SIG_DFL: usize = 0; +@@ -154,10 +160,15 @@ pub extern "C" fn killpg(pgrp: pid_t, sig: c_int) -> c_int { + /// See . 
+ #[unsafe(no_mangle)] + pub unsafe extern "C" fn pthread_kill(thread: pthread_t, sig: c_int) -> c_int { +- let os_tid = { +- let pthread = unsafe { &*(thread as *const crate::pthread::Pthread) }; +- unsafe { pthread.os_tid.get().read() } +- }; ++ let pthread = unsafe { &*(thread as *const crate::pthread::Pthread) }; ++ let os_tid = unsafe { pthread.os_tid.get().read() }; ++ let flags = crate::pthread::PthreadFlags::from_bits_retain( ++ pthread.flags.load(Ordering::Acquire), ++ ); ++ if flags.contains(crate::pthread::PthreadFlags::FINISHED) { ++ return errno::ESRCH; ++ } ++ + crate::header::pthread::e(unsafe { Sys::rlct_kill(os_tid, sig as usize) }) + } + +@@ -168,12 +179,10 @@ pub unsafe extern "C" fn pthread_sigmask( + set: *const sigset_t, + oldset: *mut sigset_t, + ) -> c_int { +- // On Linux and Redox, pthread_sigmask and sigprocmask are equivalent +- if unsafe { sigprocmask(how, set, oldset) } == 0 { +- 0 +- } else { +- //TODO: Fix race +- platform::ERRNO.get() ++ let filtered_set = unsafe { set.as_ref().map(|&block| block & !RLCT_SIGNAL_MASK) }; ++ match unsafe { Sys::sigprocmask(how, filtered_set.as_ref(), oldset.as_mut()) } { ++ Ok(()) => 0, ++ Err(errno) => errno.0, + } + } + diff --git a/local/patches/relibc/P5-robust-mutexes.patch b/local/patches/relibc/P5-robust-mutexes.patch new file mode 100644 index 00000000..1c5880a4 --- /dev/null +++ b/local/patches/relibc/P5-robust-mutexes.patch @@ -0,0 +1,380 @@ +diff --git a/src/sync/pthread_mutex.rs b/src/sync/pthread_mutex.rs +index 29bad63..af0c429 100644 +--- a/src/sync/pthread_mutex.rs ++++ b/src/sync/pthread_mutex.rs +@@ -1,3 +1,4 @@ ++use alloc::boxed::Box; + use core::{ + cell::Cell, + sync::atomic::{AtomicU32 as AtomicUint, Ordering}, +@@ -6,10 +7,9 @@ use core::{ + use crate::{ + error::Errno, + header::{bits_timespec::timespec, errno::*, pthread::*}, ++ platform::{Pal, Sys, types::c_int}, + }; + +-use crate::platform::{Pal, Sys, types::c_int}; +- + use super::FutexWaitResult; + + pub struct RlctMutex { +@@ -21,15 +21,22 @@ pub struct RlctMutex { + robust: bool, + } + ++pub struct RobustMutexNode { ++ pub next: *mut RobustMutexNode, ++ pub prev: *mut RobustMutexNode, ++ pub mutex: *const RlctMutex, ++} ++ + const STATE_UNLOCKED: u32 = 0; + const WAITING_BIT: u32 = 1 << 31; +-const INDEX_MASK: u32 = !WAITING_BIT; ++const FUTEX_OWNER_DIED: u32 = 1 << 30; ++const INDEX_MASK: u32 = !(WAITING_BIT | FUTEX_OWNER_DIED); + + // TODO: Lower limit is probably better. + const RECURSIVE_COUNT_MAX_INCLUSIVE: u32 = u32::MAX; + // TODO: How many spins should we do before it becomes more time-economical to enter kernel mode + // via futexes? 
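++// 100 iterations is an untuned heuristic; adaptive spin tuning is follow-up work.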
+-const SPIN_COUNT: usize = 0; ++const SPIN_COUNT: usize = 100; + + impl RlctMutex { + pub(crate) fn new(attr: &RlctMutexAttr) -> Result { +@@ -69,13 +76,25 @@ impl RlctMutex { + Ok(0) + } + pub fn make_consistent(&self) -> Result<(), Errno> { +- todo_skip!(0, "pthread robust mutexes: not implemented"); +- Ok(()) ++ debug_assert!(self.robust, "make_consistent called on non-robust mutex"); ++ ++ if !self.robust { ++ return Err(Errno(EINVAL)); ++ } ++ ++ let current = self.inner.load(Ordering::Relaxed); ++ let owner = current & INDEX_MASK; ++ ++ if owner == os_tid_invalid_after_fork() && current & FUTEX_OWNER_DIED != 0 { ++ self.inner.store(0, Ordering::Release); ++ Ok(()) ++ } else { ++ Err(Errno(EINVAL)) ++ } + } + fn lock_inner(&self, deadline: Option<×pec>) -> Result<(), Errno> { + let this_thread = os_tid_invalid_after_fork(); +- +- //let mut spins_left = SPIN_COUNT; ++ let mut spins_left = SPIN_COUNT; + + loop { + let result = self.inner.compare_exchange_weak( +@@ -86,45 +105,59 @@ impl RlctMutex { + ); + + match result { +- // CAS succeeded +- Ok(_) => { +- if self.ty == Ty::Recursive { +- self.increment_recursive_count()?; +- } +- return Ok(()); +- } +- // CAS failed, but the mutex was recursive and we already own the lock. ++ Ok(_) => return self.finish_lock_acquire(false), + Err(thread) if thread & INDEX_MASK == this_thread && self.ty == Ty::Recursive => { + self.increment_recursive_count()?; + return Ok(()); + } +- // CAS failed, but the mutex was error-checking and we already own the lock. + Err(thread) if thread & INDEX_MASK == this_thread && self.ty == Ty::Errck => { +- return Err(Errno(EAGAIN)); ++ return Err(Errno(EDEADLK)); + } +- // CAS spuriously failed, simply retry the CAS. TODO: Use core::hint::spin_loop()? +- Err(thread) if thread & INDEX_MASK == 0 => { +- continue; ++ Err(thread) if thread & FUTEX_OWNER_DIED != 0 && thread & INDEX_MASK == 0 => { ++ return Err(Errno(ENOTRECOVERABLE)); + } +- // CAS failed because some other thread owned the lock. We must now wait. ++ Err(thread) if thread & FUTEX_OWNER_DIED != 0 => { ++ if !self.robust { ++ return Err(Errno(ENOTRECOVERABLE)); ++ } ++ ++ let new_value = (thread & WAITING_BIT) | FUTEX_OWNER_DIED | this_thread; ++ match self.inner.compare_exchange( ++ thread, ++ new_value, ++ Ordering::Acquire, ++ Ordering::Relaxed, ++ ) { ++ Ok(_) => return self.finish_lock_acquire(true), ++ Err(_) => continue, ++ } ++ } ++ Err(thread) if thread & INDEX_MASK == 0 => continue, + Err(thread) => { +- /*if spins_left > 0 { +- // TODO: Faster to spin trying to load the flag, compared to CAS? ++ let owner = thread & INDEX_MASK; ++ ++ if !crate::pthread::mutex_owner_id_is_live(owner) { ++ if !self.robust { ++ return Err(Errno(ENOTRECOVERABLE)); ++ } ++ ++ let new_value = (thread & WAITING_BIT) | FUTEX_OWNER_DIED | this_thread; ++ match self.inner.compare_exchange( ++ thread, ++ new_value, ++ Ordering::Acquire, ++ Ordering::Relaxed, ++ ) { ++ Ok(_) => return self.finish_lock_acquire(true), ++ Err(_) => continue, ++ } ++ } ++ ++ if spins_left > 0 { + spins_left -= 1; + core::hint::spin_loop(); + continue; + } +- +- spins_left = SPIN_COUNT; +- +- let inner = self.inner.fetch_or(WAITING_BIT, Ordering::Relaxed); +- +- if inner == STATE_UNLOCKED { +- continue; +- }*/ +- +- // If the mutex is not robust, simply futex_wait until unblocked. 
+- //crate::sync::futex_wait(&self.inner, inner | WAITING_BIT, None); + if crate::sync::futex_wait(&self.inner, thread, deadline) + == FutexWaitResult::TimedOut + { +@@ -140,6 +173,20 @@ impl RlctMutex { + pub fn lock_with_timeout(&self, deadline: ×pec) -> Result<(), Errno> { + self.lock_inner(Some(deadline)) + } ++ fn finish_lock_acquire(&self, owner_dead: bool) -> Result<(), Errno> { ++ if self.ty == Ty::Recursive { ++ self.increment_recursive_count()?; ++ } ++ if self.robust { ++ add_to_robust_list(self); ++ } ++ ++ if owner_dead { ++ Err(Errno(EOWNERDEAD)) ++ } else { ++ Ok(()) ++ } ++ } + fn increment_recursive_count(&self) -> Result<(), Errno> { + // We don't have to worry about asynchronous signals here, since pthread_mutex_trylock + // is not async-signal-safe. +@@ -161,41 +208,65 @@ impl RlctMutex { + pub fn try_lock(&self) -> Result<(), Errno> { + let this_thread = os_tid_invalid_after_fork(); + +- // TODO: If recursive, omitting CAS may be faster if it is already owned by this thread. +- let result = self.inner.compare_exchange( +- STATE_UNLOCKED, +- this_thread, +- Ordering::Acquire, +- Ordering::Relaxed, +- ); ++ loop { ++ let current = self.inner.load(Ordering::Relaxed); ++ ++ if current == STATE_UNLOCKED { ++ match self.inner.compare_exchange( ++ STATE_UNLOCKED, ++ this_thread, ++ Ordering::Acquire, ++ Ordering::Relaxed, ++ ) { ++ Ok(_) => return self.finish_lock_acquire(false), ++ Err(_) => continue, ++ } ++ } + +- if self.ty == Ty::Recursive { +- match result { +- Err(index) if index & INDEX_MASK != this_thread => return Err(Errno(EBUSY)), +- _ => (), ++ let owner = current & INDEX_MASK; ++ ++ if owner == this_thread && self.ty == Ty::Recursive { ++ self.increment_recursive_count()?; ++ return Ok(()); + } + +- self.increment_recursive_count()?; ++ if owner == this_thread && self.ty == Ty::Errck { ++ return Err(Errno(EDEADLK)); ++ } + +- return Ok(()); +- } ++ if current & FUTEX_OWNER_DIED != 0 && owner == 0 { ++ return Err(Errno(ENOTRECOVERABLE)); ++ } + +- match result { +- Ok(_) => Ok(()), +- Err(index) if index & INDEX_MASK == this_thread && self.ty == Ty::Errck => { +- Err(Errno(EDEADLK)) ++ if current & FUTEX_OWNER_DIED != 0 || (owner != 0 && !crate::pthread::mutex_owner_id_is_live(owner)) { ++ if !self.robust { ++ return Err(Errno(ENOTRECOVERABLE)); ++ } ++ ++ let new_value = (current & WAITING_BIT) | FUTEX_OWNER_DIED | this_thread; ++ match self.inner.compare_exchange( ++ current, ++ new_value, ++ Ordering::Acquire, ++ Ordering::Relaxed, ++ ) { ++ Ok(_) => return self.finish_lock_acquire(true), ++ Err(_) => continue, ++ } + } +- Err(_) => Err(Errno(EBUSY)), ++ ++ return Err(Errno(EBUSY)); + } + } + // Safe because we are not protecting any data. + pub fn unlock(&self) -> Result<(), Errno> { ++ let current = self.inner.load(Ordering::Relaxed); ++ + if self.robust || matches!(self.ty, Ty::Recursive | Ty::Errck) { +- if self.inner.load(Ordering::Relaxed) & INDEX_MASK != os_tid_invalid_after_fork() { ++ if current & INDEX_MASK != os_tid_invalid_after_fork() { + return Err(Errno(EPERM)); + } + +- // TODO: Is this fence correct? 
+ core::sync::atomic::fence(Ordering::Acquire); + } + +@@ -208,18 +279,47 @@ impl RlctMutex { + } + } + +- self.inner.store(STATE_UNLOCKED, Ordering::Release); +- crate::sync::futex_wake(&self.inner, i32::MAX); +- /*let was_waiting = self.inner.swap(STATE_UNLOCKED, Ordering::Release) & WAITING_BIT != 0; ++ if self.robust { ++ remove_from_robust_list(self); ++ } + +- if was_waiting { +- let _ = crate::sync::futex_wake(&self.inner, 1); +- }*/ ++ let new_state = if self.robust && current & FUTEX_OWNER_DIED != 0 { ++ FUTEX_OWNER_DIED ++ } else { ++ STATE_UNLOCKED ++ }; ++ ++ self.inner.store(new_state, Ordering::Release); ++ crate::sync::futex_wake(&self.inner, i32::MAX); + + Ok(()) + } + } + ++pub(crate) unsafe fn mark_robust_mutexes_dead(thread: &crate::pthread::Pthread) { ++ let head = thread.robust_list_head.get(); ++ let this_thread = os_tid_invalid_after_fork(); ++ let mut node = unsafe { *head }; ++ ++ unsafe { *head = core::ptr::null_mut() }; ++ ++ while !node.is_null() { ++ let next = unsafe { (*node).next }; ++ let mutex = unsafe { &*(*node).mutex }; ++ let current = mutex.inner.load(Ordering::Relaxed); ++ ++ if current & INDEX_MASK == this_thread { ++ mutex ++ .inner ++ .store((current & WAITING_BIT) | FUTEX_OWNER_DIED | this_thread, Ordering::Release); ++ crate::sync::futex_wake(&mutex.inner, i32::MAX); ++ } ++ ++ unsafe { drop(Box::from_raw(node)) }; ++ node = next; ++ } ++} ++ + #[repr(u8)] + #[derive(PartialEq)] + enum Ty { +@@ -237,6 +337,54 @@ enum Ty { + #[thread_local] + static CACHED_OS_TID_INVALID_AFTER_FORK: Cell = Cell::new(0); + ++fn add_to_robust_list(mutex: &RlctMutex) { ++ let thread = crate::pthread::current_thread().expect("current thread not present"); ++ let node_ptr = Box::into_raw(Box::new(RobustMutexNode { ++ next: core::ptr::null_mut(), ++ prev: core::ptr::null_mut(), ++ mutex: core::ptr::from_ref(mutex), ++ })); ++ ++ unsafe { ++ let head = thread.robust_list_head.get(); ++ if !(*head).is_null() { ++ (**head).prev = node_ptr; ++ } ++ (*node_ptr).next = *head; ++ *head = node_ptr; ++ } ++} ++ ++fn remove_from_robust_list(mutex: &RlctMutex) { ++ let thread = match crate::pthread::current_thread() { ++ Some(thread) => thread, ++ None => return, ++ }; ++ ++ unsafe { ++ let mut node = *thread.robust_list_head.get(); ++ ++ while !node.is_null() { ++ if core::ptr::eq((*node).mutex, core::ptr::from_ref(mutex)) { ++ if !(*node).prev.is_null() { ++ (*(*node).prev).next = (*node).next; ++ } else { ++ *thread.robust_list_head.get() = (*node).next; ++ } ++ ++ if !(*node).next.is_null() { ++ (*(*node).next).prev = (*node).prev; ++ } ++ ++ drop(Box::from_raw(node)); ++ return; ++ } ++ ++ node = (*node).next; ++ } ++ } ++} ++ + // Assumes TIDs are unique between processes, which I only know is true for Redox. + fn os_tid_invalid_after_fork() -> u32 { + // TODO: Coordinate better if using shared == PTHREAD_PROCESS_SHARED, with up to 2^32 separate diff --git a/local/patches/relibc/P5-sched-api.patch b/local/patches/relibc/P5-sched-api.patch new file mode 100644 index 00000000..b7bc9a27 --- /dev/null +++ b/local/patches/relibc/P5-sched-api.patch @@ -0,0 +1,130 @@ +diff --git a/src/header/sched/mod.rs b/src/header/sched/mod.rs +index bcdd346..6066550 100644 +--- a/src/header/sched/mod.rs ++++ b/src/header/sched/mod.rs +@@ -27,43 +27,110 @@ pub const SCHED_RR: c_int = 1; + pub const SCHED_OTHER: c_int = 2; + + /// See . 
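++/// The static ranges follow Linux: 1..=99 for the real-time policies and a single priority of 0 for SCHED_OTHER.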
+-// #[unsafe(no_mangle)] ++#[unsafe(no_mangle)] + pub extern "C" fn sched_get_priority_max(policy: c_int) -> c_int { +- todo!() ++ match policy { ++ SCHED_FIFO | SCHED_RR => 99, ++ SCHED_OTHER => 0, ++ _ => { ++ crate::platform::ERRNO.set(crate::header::errno::EINVAL); ++ -1 ++ } ++ } + } + +-/// See . +-// #[unsafe(no_mangle)] ++/// See . ++#[unsafe(no_mangle)] + pub extern "C" fn sched_get_priority_min(policy: c_int) -> c_int { +- todo!() ++ match policy { ++ SCHED_FIFO | SCHED_RR => 1, ++ SCHED_OTHER => 0, ++ _ => { ++ crate::platform::ERRNO.set(crate::header::errno::EINVAL); ++ -1 ++ } ++ } + } + + /// See . +-// #[unsafe(no_mangle)] ++#[unsafe(no_mangle)] + pub unsafe extern "C" fn sched_getparam(pid: pid_t, param: *mut sched_param) -> c_int { +- todo!() ++ if pid != 0 { ++ crate::platform::ERRNO.set(crate::header::errno::ESRCH); ++ return -1; ++ } ++ crate::platform::ERRNO.set(crate::header::errno::ENOSYS); ++ -1 ++} ++ ++/// See . ++#[unsafe(no_mangle)] ++pub extern "C" fn sched_getscheduler(pid: pid_t) -> c_int { ++ if pid != 0 { ++ crate::platform::ERRNO.set(crate::header::errno::ESRCH); ++ return -1; ++ } ++ crate::platform::ERRNO.set(crate::header::errno::ENOSYS); ++ -1 + } + + /// See . +-// #[unsafe(no_mangle)] +-pub extern "C" fn sched_rr_get_interval(pid: pid_t, time: *const timespec) -> c_int { +- todo!() ++#[unsafe(no_mangle)] ++pub extern "C" fn sched_rr_get_interval(pid: pid_t, tp: *mut timespec) -> c_int { ++ if pid != 0 { ++ crate::platform::ERRNO.set(crate::header::errno::ESRCH); ++ return -1; ++ } ++ if tp.is_null() { ++ crate::platform::ERRNO.set(crate::header::errno::EINVAL); ++ return -1; ++ } ++ unsafe { ++ (*tp).tv_sec = 0; ++ (*tp).tv_nsec = 100_000_000; // 100ms default SCHED_RR quantum ++ } ++ 0 + } + + /// See . +-// #[unsafe(no_mangle)] +-pub unsafe extern "C" fn sched_setparam(pid: pid_t, param: *const sched_param) -> c_int { +- todo!() ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn sched_setparam(pid: pid_t, _param: *const sched_param) -> c_int { ++ if pid != 0 { ++ crate::platform::ERRNO.set(crate::header::errno::ESRCH); ++ return -1; ++ } ++ crate::platform::ERRNO.set(crate::header::errno::ENOSYS); ++ -1 + } + + /// See . +-// #[unsafe(no_mangle)] ++#[unsafe(no_mangle)] + pub extern "C" fn sched_setscheduler( + pid: pid_t, + policy: c_int, + param: *const sched_param, + ) -> c_int { +- todo!() ++ if pid != 0 { ++ crate::platform::ERRNO.set(crate::header::errno::ESRCH); ++ return -1; ++ } ++ match policy { ++ SCHED_OTHER => { ++ if !param.is_null() && unsafe { (*param).sched_priority } != 0 { ++ crate::platform::ERRNO.set(crate::header::errno::EINVAL); ++ return -1; ++ } ++ SCHED_OTHER ++ } ++ SCHED_FIFO | SCHED_RR => { ++ crate::platform::ERRNO.set(crate::header::errno::ENOSYS); ++ -1 ++ } ++ _ => { ++ crate::platform::ERRNO.set(crate::header::errno::EINVAL); ++ -1 ++ } ++ } + } + + /// See . 
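A minimal caller-side sketch of the behaviour the stubs above commit to (illustrative only, not part of the patch series; the policy constants and return values are copied from P5-sched-api.patch, while the extern declarations and linking against relibc are assumptions):

```rust
// Hypothetical smoke test for the P5-sched-api stubs; assumes it links against relibc.
use core::ffi::c_int;

// Policy values copied from the patched src/header/sched/mod.rs.
const SCHED_RR: c_int = 1;
const SCHED_OTHER: c_int = 2;

unsafe extern "C" {
    // Assumed to resolve to the #[unsafe(no_mangle)] symbols exported above.
    fn sched_get_priority_min(policy: c_int) -> c_int;
    fn sched_get_priority_max(policy: c_int) -> c_int;
}

fn main() {
    unsafe {
        // Real-time policies report the Linux-compatible 1..=99 static range.
        assert_eq!(sched_get_priority_min(SCHED_RR), 1);
        assert_eq!(sched_get_priority_max(SCHED_RR), 99);
        // SCHED_OTHER carries a single static priority of 0.
        assert_eq!(sched_get_priority_max(SCHED_OTHER), 0);
        // Any other policy fails with -1 and errno set to EINVAL.
        assert_eq!(sched_get_priority_max(1337), -1);
    }
}
```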
diff --git a/local/patches/relibc/P7-pthread-affinity.patch b/local/patches/relibc/P7-pthread-affinity.patch new file mode 100644 index 00000000..16571241 --- /dev/null +++ b/local/patches/relibc/P7-pthread-affinity.patch @@ -0,0 +1,231 @@ +diff --git a/src/header/pthread/cbindgen.toml b/src/header/pthread/cbindgen.toml +--- a/src/header/pthread/cbindgen.toml ++++ b/src/header/pthread/cbindgen.toml +@@ -10,0 +11 @@ cpp_compat = true ++"cpu_set_t" = "struct cpu_set_t" +diff --git a/src/header/pthread/mod.rs b/src/header/pthread/mod.rs +--- a/src/header/pthread/mod.rs ++++ b/src/header/pthread/mod.rs +@@ -6 +6,8 @@ use alloc::collections::LinkedList; +-use core::{cell::Cell, ptr::NonNull}; ++use core::{cell::Cell, mem::size_of, ptr::NonNull}; ++ ++#[cfg(target_os = "linux")] ++use sc::syscall; ++#[cfg(target_os = "redox")] ++use redox_rt::proc::FdGuard; ++#[cfg(target_os = "redox")] ++use syscall; +@@ -9,0 +17 @@ use crate::{ ++ header::errno::EINVAL, +@@ -14 +22 @@ use crate::{ +- c_int, c_uchar, c_uint, c_void, clockid_t, pthread_attr_t, pthread_barrier_t, ++ c_char, c_int, c_uchar, c_uint, c_void, clockid_t, pthread_attr_t, pthread_barrier_t, +@@ -22,0 +31,3 @@ use crate::{ ++#[cfg(target_os = "linux")] ++use crate::platform::sys::e_raw; ++ +@@ -29,0 +41,93 @@ pub fn e(result: Result<(), Errno>) -> i32 { ++const RLCT_AFFINITY_BYTES: usize = size_of::(); ++const RLCT_MAX_AFFINITY_CPUS: usize = u64::BITS as usize; ++ ++fn cpuset_bytes<'a>(cpusetsize: size_t, cpuset: *const cpu_set_t) -> Result<&'a [u8], Errno> { ++ if cpuset.is_null() || !(RLCT_AFFINITY_BYTES..=size_of::()).contains(&cpusetsize) { ++ return Err(Errno(EINVAL)); ++ } ++ ++ Ok(unsafe { core::slice::from_raw_parts(cpuset.cast::(), cpusetsize) }) ++} ++ ++fn cpuset_bytes_mut<'a>( ++ cpusetsize: size_t, ++ cpuset: *mut cpu_set_t, ++) -> Result<&'a mut [u8], Errno> { ++ if cpuset.is_null() || !(RLCT_AFFINITY_BYTES..=size_of::()).contains(&cpusetsize) { ++ return Err(Errno(EINVAL)); ++ } ++ ++ Ok(unsafe { core::slice::from_raw_parts_mut(cpuset.cast::(), cpusetsize) }) ++} ++ ++fn cpuset_to_u64(cpusetsize: size_t, cpuset: *const cpu_set_t) -> Result { ++ let bytes = cpuset_bytes(cpusetsize, cpuset)?; ++ let mut mask = 0_u64; ++ ++ for (byte_index, byte) in bytes.iter().copied().enumerate() { ++ for bit in 0..u8::BITS as usize { ++ if byte & (1 << bit) == 0 { ++ continue; ++ } ++ ++ let cpu = byte_index * u8::BITS as usize + bit; ++ if cpu >= RLCT_MAX_AFFINITY_CPUS { ++ return Err(Errno(EINVAL)); ++ } ++ ++ mask |= 1_u64 << cpu; ++ } ++ } ++ ++ Ok(mask) ++} ++ ++fn copy_u64_to_cpuset(mask: u64, cpusetsize: size_t, cpuset: *mut cpu_set_t) -> Result<(), Errno> { ++ let bytes = cpuset_bytes_mut(cpusetsize, cpuset)?; ++ bytes.fill(0); ++ ++ for (byte_index, dst) in bytes.iter_mut().take(RLCT_AFFINITY_BYTES).enumerate() { ++ *dst = (mask >> (byte_index * u8::BITS as usize)) as u8; ++ } ++ ++ Ok(()) ++} ++ ++#[cfg(target_os = "redox")] ++fn redox_set_thread_affinity(thread: &pthread::Pthread, mask: u64) -> Result<(), Errno> { ++ let mut kernel_cpuset = cpu_set_t::default(); ++ kernel_cpuset.__bits[0] = mask; ++ ++ let handle = FdGuard::new(unsafe { ++ syscall::dup(thread.os_tid.get().read().thread_fd, b"sched-affinity")? 
++ }); ++ let _ = handle.write(unsafe { ++ core::slice::from_raw_parts( ++ core::ptr::from_ref(&kernel_cpuset).cast::(), ++ size_of::(), ++ ) ++ })?; ++ ++ Ok(()) ++} ++ ++#[cfg(target_os = "redox")] ++fn redox_get_thread_affinity(thread: &pthread::Pthread) -> Result { ++ let handle = FdGuard::new(unsafe { ++ syscall::dup(thread.os_tid.get().read().thread_fd, b"sched-affinity")? ++ }); ++ let mut kernel_cpuset = cpu_set_t::default(); ++ let _ = handle.read(unsafe { ++ core::slice::from_raw_parts_mut( ++ core::ptr::from_mut(&mut kernel_cpuset).cast::(), ++ size_of::(), ++ ) ++ })?; ++ ++ if kernel_cpuset.__bits[1..].iter().any(|bits| *bits != 0) { ++ return Err(Errno(EINVAL)); ++ } ++ ++ Ok(kernel_cpuset.__bits[0]) ++} ++ +@@ -188,0 +293,36 @@ pub unsafe extern "C" fn pthread_getcpuclockid( ++/// GNU extension. See . ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn pthread_getaffinity_np( ++ thread: pthread_t, ++ cpusetsize: size_t, ++ cpuset: *mut cpu_set_t, ++) -> c_int { ++ let thread: &pthread::Pthread = unsafe { &*thread.cast() }; ++ ++ let result = { ++ #[cfg(target_os = "redox")] ++ { ++ redox_get_thread_affinity(thread).and_then(|mask| copy_u64_to_cpuset(mask, cpusetsize, cpuset)) ++ } ++ ++ #[cfg(target_os = "linux")] ++ { ++ if cpuset.is_null() { ++ Err(Errno(EINVAL)) ++ } else { ++ e_raw(unsafe { ++ syscall!( ++ SCHED_GETAFFINITY, ++ thread.os_tid.get().read().thread_id, ++ cpusetsize, ++ cpuset.cast::() ++ ) ++ }) ++ .map(|_| ()) ++ } ++ } ++ }; ++ ++ e(result) ++} ++ +@@ -237,0 +378,36 @@ pub unsafe extern "C" fn pthread_self() -> pthread_t { ++/// GNU extension. See . ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn pthread_setaffinity_np( ++ thread: pthread_t, ++ cpusetsize: size_t, ++ cpuset: *const cpu_set_t, ++) -> c_int { ++ let thread: &pthread::Pthread = unsafe { &*thread.cast() }; ++ ++ let result = { ++ #[cfg(target_os = "redox")] ++ { ++ cpuset_to_u64(cpusetsize, cpuset).and_then(|mask| redox_set_thread_affinity(thread, mask)) ++ } ++ ++ #[cfg(target_os = "linux")] ++ { ++ if cpuset.is_null() { ++ Err(Errno(EINVAL)) ++ } else { ++ e_raw(unsafe { ++ syscall!( ++ SCHED_SETAFFINITY, ++ thread.os_tid.get().read().thread_id, ++ cpusetsize, ++ cpuset.cast::() ++ ) ++ }) ++ .map(|_| ()) ++ } ++ } ++ }; ++ ++ e(result) ++} ++ +diff --git a/src/header/sched/cbindgen.toml b/src/header/sched/cbindgen.toml +--- a/src/header/sched/cbindgen.toml ++++ b/src/header/sched/cbindgen.toml +@@ -22,0 +23,14 @@ prefix_with_name = true ++ ++[export] ++include = [ ++ "sched_param", ++ "cpu_set_t", ++ "sched_get_priority_max", ++ "sched_get_priority_min", ++ "sched_getparam", ++ "sched_getscheduler", ++ "sched_rr_get_interval", ++ "sched_setparam", ++ "sched_setscheduler", ++ "sched_yield", ++] +diff --git a/src/header/sched/mod.rs b/src/header/sched/mod.rs +--- a/src/header/sched/mod.rs ++++ b/src/header/sched/mod.rs +@@ -12,0 +13,2 @@ ++pub const CPU_SETSIZE: usize = 1024; ++ +@@ -20,0 +23,7 @@ ++/// Linux-compatible CPU affinity mask storage. 
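++/// 16 × u64 = 1024 bits, matching CPU_SETSIZE; the Redox backend currently honours only __bits[0] (64 CPUs).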
++#[repr(C)] ++#[derive(Clone, Copy, Debug, Default)] ++pub struct cpu_set_t { ++ pub __bits: [u64; 16], ++} ++ +@@ -143,0 +153,3 @@ ++ ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn cbindgen_stupid_struct_user_for_cpu_set_t(_: cpu_set_t) {} diff --git a/local/patches/relibc/P7-pthread-setname.patch b/local/patches/relibc/P7-pthread-setname.patch new file mode 100644 index 00000000..efcee1b8 --- /dev/null +++ b/local/patches/relibc/P7-pthread-setname.patch @@ -0,0 +1,326 @@ +diff --git a/src/header/pthread/mod.rs b/src/header/pthread/mod.rs +index c742a42..008090a 100644 +--- a/src/header/pthread/mod.rs ++++ b/src/header/pthread/mod.rs +@@ -3,15 +3,26 @@ + //! See . + + use alloc::collections::LinkedList; +-use core::{cell::Cell, ptr::NonNull}; ++use core::{cell::Cell, mem::size_of, ptr::NonNull}; ++ ++#[cfg(target_os = "redox")] ++use redox_rt::proc::FdGuard; ++#[cfg(target_os = "linux")] ++use sc::syscall; ++#[cfg(target_os = "redox")] ++use syscall; + + use crate::{ + error::Errno, +- header::{bits_timespec::timespec, sched::*}, ++ header::{ ++ bits_timespec::timespec, ++ errno::{EINVAL, ERANGE}, ++ sched::*, ++ }, + platform::{ + Pal, Sys, + types::{ +- c_int, c_uchar, c_uint, c_void, clockid_t, pthread_attr_t, pthread_barrier_t, ++ c_char, c_int, c_uchar, c_uint, c_void, clockid_t, pthread_attr_t, pthread_barrier_t, + pthread_barrierattr_t, pthread_cond_t, pthread_condattr_t, pthread_key_t, + pthread_mutex_t, pthread_mutexattr_t, pthread_once_t, pthread_rwlock_t, + pthread_rwlockattr_t, pthread_spinlock_t, pthread_t, size_t, +@@ -20,6 +31,9 @@ use crate::{ + pthread, + }; + ++#[cfg(target_os = "linux")] ++use crate::platform::sys::e_raw; ++ + pub fn e(result: Result<(), Errno>) -> i32 { + match result { + Ok(()) => 0, +@@ -27,6 +41,96 @@ pub fn e(result: Result<(), Errno>) -> i32 { + } + } + ++const RLCT_AFFINITY_BYTES: usize = size_of::(); ++const RLCT_MAX_AFFINITY_CPUS: usize = u64::BITS as usize; ++ ++fn cpuset_bytes<'a>(cpusetsize: size_t, cpuset: *const cpu_set_t) -> Result<&'a [u8], Errno> { ++ if cpuset.is_null() || !(RLCT_AFFINITY_BYTES..=size_of::()).contains(&cpusetsize) { ++ return Err(Errno(EINVAL)); ++ } ++ ++ Ok(unsafe { core::slice::from_raw_parts(cpuset.cast::(), cpusetsize) }) ++} ++ ++fn cpuset_bytes_mut<'a>(cpusetsize: size_t, cpuset: *mut cpu_set_t) -> Result<&'a mut [u8], Errno> { ++ if cpuset.is_null() || !(RLCT_AFFINITY_BYTES..=size_of::()).contains(&cpusetsize) { ++ return Err(Errno(EINVAL)); ++ } ++ ++ Ok(unsafe { core::slice::from_raw_parts_mut(cpuset.cast::(), cpusetsize) }) ++} ++ ++fn cpuset_to_u64(cpusetsize: size_t, cpuset: *const cpu_set_t) -> Result { ++ let bytes = cpuset_bytes(cpusetsize, cpuset)?; ++ let mut mask = 0_u64; ++ ++ for (byte_index, byte) in bytes.iter().copied().enumerate() { ++ for bit in 0..u8::BITS as usize { ++ if byte & (1 << bit) == 0 { ++ continue; ++ } ++ ++ let cpu = byte_index * u8::BITS as usize + bit; ++ if cpu >= RLCT_MAX_AFFINITY_CPUS { ++ return Err(Errno(EINVAL)); ++ } ++ ++ mask |= 1_u64 << cpu; ++ } ++ } ++ ++ Ok(mask) ++} ++ ++fn copy_u64_to_cpuset(mask: u64, cpusetsize: size_t, cpuset: *mut cpu_set_t) -> Result<(), Errno> { ++ let bytes = cpuset_bytes_mut(cpusetsize, cpuset)?; ++ bytes.fill(0); ++ ++ for (byte_index, dst) in bytes.iter_mut().take(RLCT_AFFINITY_BYTES).enumerate() { ++ *dst = (mask >> (byte_index * u8::BITS as usize)) as u8; ++ } ++ ++ Ok(()) ++} ++ ++#[cfg(target_os = "redox")] ++fn redox_set_thread_affinity(thread: &pthread::Pthread, mask: u64) -> Result<(), Errno> { ++ let mut kernel_cpuset = 
cpu_set_t::default(); ++ kernel_cpuset.__bits[0] = mask; ++ ++ let handle = FdGuard::new(unsafe { ++ syscall::dup(thread.os_tid.get().read().thread_fd, b"sched-affinity")? ++ }); ++ let _ = handle.write(unsafe { ++ core::slice::from_raw_parts( ++ core::ptr::from_ref(&kernel_cpuset).cast::(), ++ size_of::(), ++ ) ++ })?; ++ ++ Ok(()) ++} ++ ++#[cfg(target_os = "redox")] ++fn redox_get_thread_affinity(thread: &pthread::Pthread) -> Result { ++ let handle = FdGuard::new(unsafe { ++ syscall::dup(thread.os_tid.get().read().thread_fd, b"sched-affinity")? ++ }); ++ let mut kernel_cpuset = cpu_set_t::default(); ++ let _ = handle.read(unsafe { ++ core::slice::from_raw_parts_mut( ++ core::ptr::from_mut(&mut kernel_cpuset).cast::(), ++ size_of::(), ++ ) ++ })?; ++ ++ if kernel_cpuset.__bits[1..].iter().any(|bits| *bits != 0) { ++ return Err(Errno(EINVAL)); ++ } ++ ++ Ok(kernel_cpuset.__bits[0]) ++} ++ + #[derive(Clone)] + pub(crate) struct RlctAttr { + pub detachstate: c_uchar, +@@ -186,6 +290,43 @@ pub unsafe extern "C" fn pthread_getcpuclockid( + } + } + ++/// GNU extension. See . ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn pthread_getaffinity_np( ++ thread: pthread_t, ++ cpusetsize: size_t, ++ cpuset: *mut cpu_set_t, ++) -> c_int { ++ let thread: &pthread::Pthread = unsafe { &*thread.cast() }; ++ ++ let result = { ++ #[cfg(target_os = "redox")] ++ { ++ redox_get_thread_affinity(thread) ++ .and_then(|mask| copy_u64_to_cpuset(mask, cpusetsize, cpuset)) ++ } ++ ++ #[cfg(target_os = "linux")] ++ { ++ if cpuset.is_null() { ++ Err(Errno(EINVAL)) ++ } else { ++ e_raw(unsafe { ++ syscall!( ++ SCHED_GETAFFINITY, ++ thread.os_tid.get().read().thread_id, ++ cpusetsize, ++ cpuset.cast::() ++ ) ++ }) ++ .map(|_| ()) ++ } ++ } ++ }; ++ ++ e(result) ++} ++ + /// See . + #[unsafe(no_mangle)] + pub unsafe extern "C" fn pthread_getschedparam( +@@ -235,6 +376,43 @@ pub unsafe extern "C" fn pthread_self() -> pthread_t { + core::ptr::from_ref(unsafe { pthread::current_thread().unwrap_unchecked() }) as *mut _ + } + ++/// GNU extension. See . ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn pthread_setaffinity_np( ++ thread: pthread_t, ++ cpusetsize: size_t, ++ cpuset: *const cpu_set_t, ++) -> c_int { ++ let thread: &pthread::Pthread = unsafe { &*thread.cast() }; ++ ++ let result = { ++ #[cfg(target_os = "redox")] ++ { ++ cpuset_to_u64(cpusetsize, cpuset) ++ .and_then(|mask| redox_set_thread_affinity(thread, mask)) ++ } ++ ++ #[cfg(target_os = "linux")] ++ { ++ if cpuset.is_null() { ++ Err(Errno(EINVAL)) ++ } else { ++ e_raw(unsafe { ++ syscall!( ++ SCHED_SETAFFINITY, ++ thread.os_tid.get().read().thread_id, ++ cpusetsize, ++ cpuset.cast::() ++ ) ++ }) ++ .map(|_| ()) ++ } ++ } ++ }; ++ ++ e(result) ++} ++ + /// See . + #[unsafe(no_mangle)] + pub unsafe extern "C" fn pthread_setcancelstate(state: c_int, oldstate: *mut c_int) -> c_int { +@@ -307,6 +485,13 @@ pub unsafe extern "C" fn pthread_testcancel() { + unsafe { pthread::testcancel() }; + } + ++/// ++/// ++/// Non-standard GNU extension. Prefer `sched_yield()` instead. ++pub extern "C" fn pthread_yield() { ++ let _ = Sys::sched_yield(); ++} ++ + // Must be the same struct as defined in the pthread_cleanup_push macro. 
+ #[repr(C)] + pub(crate) struct CleanupLinkedListEntry { +@@ -350,3 +535,82 @@ pub(crate) unsafe fn run_destructor_stack() { + (entry.routine)(entry.arg); + } + } ++ ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn pthread_setname_np(thread: pthread_t, name: *const c_char) -> c_int { ++ if name.is_null() { ++ return EINVAL; ++ } ++ ++ let cstr = unsafe { core::ffi::CStr::from_ptr(name) }; ++ let name_bytes = cstr.to_bytes(); ++ let len = name_bytes.len().min(31); ++ ++ #[cfg(target_os = "redox")] ++ { ++ let thread = unsafe { &*thread.cast::() }; ++ let os_tid = unsafe { thread.os_tid.get().read() }; ++ let path = alloc::format!("proc:{}/name", os_tid.thread_fd); ++ let fd = match Sys::open(&path, crate::header::fcntl::O_WRONLY, 0) { ++ Ok(fd) => fd, ++ Err(Errno(code)) => return code, ++ }; ++ ++ let result = match Sys::write(fd, &name_bytes[..len]) { ++ Ok(written) if written == len => 0, ++ Ok(_) => crate::header::errno::EIO, ++ Err(Errno(code)) => code, ++ }; ++ let _ = Sys::close(fd); ++ result ++ } ++ #[cfg(not(target_os = "redox"))] ++ { ++ let _ = thread; ++ 0 ++ } ++} ++ ++#[unsafe(no_mangle)] ++pub unsafe extern "C" fn pthread_getname_np( ++ thread: pthread_t, ++ name: *mut c_char, ++ len: size_t, ++) -> c_int { ++ if name.is_null() { ++ return EINVAL; ++ } ++ if len == 0 { ++ return ERANGE; ++ } ++ ++ #[cfg(target_os = "redox")] ++ { ++ let thread = unsafe { &*thread.cast::() }; ++ let os_tid = unsafe { thread.os_tid.get().read() }; ++ let path = alloc::format!("proc:{}/name", os_tid.thread_fd); ++ let fd = match Sys::open(&path, crate::header::fcntl::O_RDONLY, 0) { ++ Ok(fd) => fd, ++ Err(Errno(code)) => return code, ++ }; ++ ++ let mut buf = [0u8; 31]; ++ let result = match Sys::read(fd, &mut buf) { ++ Ok(read) if read < len => { ++ unsafe { core::ptr::copy_nonoverlapping(buf.as_ptr(), name.cast(), read) }; ++ unsafe { *name.add(read) = 0 }; ++ 0 ++ } ++ Ok(_) => ERANGE, ++ Err(Errno(code)) => code, ++ }; ++ let _ = Sys::close(fd); ++ result ++ } ++ #[cfg(not(target_os = "redox"))] ++ { ++ let _ = thread; ++ unsafe { *name = 0 }; ++ 0 ++ } ++} diff --git a/local/patches/relibc/P7-setpriority.patch b/local/patches/relibc/P7-setpriority.patch new file mode 100644 index 00000000..dcc499f2 --- /dev/null +++ b/local/patches/relibc/P7-setpriority.patch @@ -0,0 +1,104 @@ +diff --git a/src/platform/redox/mod.rs b/src/platform/redox/mod.rs +--- a/src/platform/redox/mod.rs ++++ b/src/platform/redox/mod.rs +@@ -77,11 +77,74 @@ static mut BRK_CUR: *mut c_void = ptr::null_mut(); + static mut BRK_END: *mut c_void = ptr::null_mut(); + + const PAGE_SIZE: usize = 4096; ++const NICE_MIN: c_int = -20; ++const NICE_MAX: c_int = 19; + + fn round_up_to_page_size(val: usize) -> Option { + val.checked_add(PAGE_SIZE) + .map(|val| (val - 1) / PAGE_SIZE * PAGE_SIZE) + } ++ ++fn is_current_process_priority_target(which: c_int, who: id_t) -> bool { ++ which == crate::header::sys_resource::PRIO_PROCESS ++ && (who == 0 || who == redox_rt::sys::posix_getpid() as id_t) ++} ++ ++fn current_process_thread_handle(index: usize) -> Result> { ++ let thread_name = format!("thread-{index}"); ++ match redox_rt::current_proc_fd().dup(thread_name.as_bytes()) { ++ Ok(thread_fd) => Ok(Some(thread_fd)), ++ Err(error) if error.errno == ENOENT => Ok(None), ++ Err(error) => Err(Errno(error.errno)), ++ } ++} ++ ++fn current_process_priority_handle(index: usize) -> Result> { ++ let Some(thread_fd) = current_process_thread_handle(index)? 
else {
++ return Ok(None);
++ };
++
++ thread_fd
++ .dup(b"priority")
++ .map(Some)
++ .map_err(|error| Errno(error.errno))
++}
++
++fn read_current_process_nice() -> Result<c_int> {
++ let Some(priority_fd) = current_process_priority_handle(0)? else {
++ return Err(Errno(ESRCH));
++ };
++
++ let mut nice_bytes = [0_u8; size_of::<c_int>()];
++ if priority_fd.read(&mut nice_bytes)? != size_of::<c_int>() {
++ return Err(Errno(EIO));
++ }
++
++ Ok(c_int::from_ne_bytes(nice_bytes))
++}
++
++fn write_current_process_nice(nice: c_int) -> Result<()> {
++ let mut updated_threads = 0;
++ let nice_bytes = nice.to_ne_bytes();
++
++ for index in 0.. {
++ let Some(priority_fd) = current_process_priority_handle(index)? else {
++ break;
++ };
++
++ if priority_fd.write(&nice_bytes)? != nice_bytes.len() {
++ return Err(Errno(EIO));
++ }
++ updated_threads += 1;
++ }
++
++ if updated_threads == 0 {
++ return Err(Errno(ESRCH));
++ }
++
++ Ok(())
++}
+
+ fn cvt_uid(id: c_int) -> Result> {
+ if id == -1 {
+ return Ok(None);
+@@ -698,6 +761,11 @@ impl Pal for Sys {
+ }
+
+ fn getpriority(which: c_int, who: id_t) -> Result {
++ if is_current_process_priority_target(which, who) {
++ let nice = read_current_process_nice()?;
++ return Ok(20 - nice);
++ }
++
+ match redox_rt::sys::posix_getpriority(which, who as u32) {
+ Ok(kernel_prio) => {
+ let posix_prio = (kernel_prio as i32 * -1) + 40 as i32;
+@@ -1274,7 +1342,12 @@ impl Pal for Sys {
+ }
+
+ fn setpriority(which: c_int, who: id_t, prio: c_int) -> Result<()> {
+- let clamped_prio = prio.clamp(-20, 19);
++ let clamped_prio = prio.clamp(NICE_MIN, NICE_MAX);
++
++ if is_current_process_priority_target(which, who) {
++ return write_current_process_nice(clamped_prio);
++ }
++
+ let kernel_prio = (20 + clamped_prio) as u32;
+
+ match redox_rt::sys::posix_setpriority(which, who as u32, kernel_prio) {
diff --git a/local/patches/relibc/P9-spin-and-barrier.patch
new file mode 100644
index 00000000..51ecc3e9
--- /dev/null
+++ b/local/patches/relibc/P9-spin-and-barrier.patch
@@ -0,0 +1,43 @@
+diff --git a/src/sync/pthread_mutex.rs
+index 2871a6149..3c8e73f15 100644
+--- a/src/sync/pthread_mutex.rs
++++ b/src/sync/pthread_mutex.rs
+@@ -35,7 +35,7 @@ const FUTEX_OWNER_DIED: u32 = 1 << 30;
+ const INDEX_MASK: u32 = !(WAITING_BIT | FUTEX_OWNER_DIED);
+ // TODO: Lower limit is probably better.
+ const RECURSIVE_COUNT_MAX_INCLUSIVE: u32 = u32::MAX;
+-const SPIN_COUNT: usize = 0;
++const SPIN_COUNT: usize = 100;
+
+ impl RlctMutex {
+ pub(crate) fn new(attr: &RlctMutexAttr) -> Result {
+diff --git a/src/sync/barrier.rs
+index b5847b5..a8e3c2f0 100644
+--- a/src/sync/barrier.rs
++++ b/src/sync/barrier.rs
+@@ -47,6 +47,9 @@ impl Barrier {
+ cvar: FutexState::new(count.get()),
+ }
+ }
++ pub fn destroy(&self) {}
++
+ pub fn wait(&self) -> WaitResult {
+ let _ = &self.lock;
+ let sense = self.cvar.sense.load(Ordering::Acquire);
+diff --git a/src/header/pthread/barrier.rs
+index 1a5df3a..e69e2b9 100644
+--- a/src/header/pthread/barrier.rs
++++ b/src/header/pthread/barrier.rs
+@@ -24,10 +24,10 @@ pub(crate) struct RlctBarrierAttr {
+ // Not async-signal-safe.
+ #[unsafe(no_mangle)]
+ pub unsafe extern "C" fn pthread_barrier_destroy(barrier: *mut pthread_barrier_t) -> c_int {
+- // Behavior is undefined if any thread is currently waiting when this is called.
+-
+- // No-op, currently.
+- unsafe { core::ptr::drop_in_place(barrier.cast::<RlctBarrier>()) };
++ let barrier = unsafe { &*barrier.cast::<RlctBarrier>() };
++ barrier.destroy();
+
+ 0
+ }
\ No newline at end of file
diff --git a/local/recipes/system/numad/recipe.toml
new file mode 100644
index 00000000..4e47e6bb
--- /dev/null
+++ b/local/recipes/system/numad/recipe.toml
@@ -0,0 +1,5 @@
+[source]
+path = "source"
+
+[build]
+template = "cargo"
diff --git a/local/recipes/system/numad/source/Cargo.toml
new file mode 100644
index 00000000..76fcc5d7
--- /dev/null
+++ b/local/recipes/system/numad/source/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "numad"
+version = "0.1.0"
+edition = "2021"
+description = "Red Bear OS NUMA topology daemon — parses ACPI SRAT/SLIT and feeds kernel NUMA hints"
+
+[[bin]]
+name = "numad"
+path = "src/main.rs"
diff --git a/local/recipes/system/numad/source/src/main.rs
new file mode 100644
index 00000000..43c063aa
--- /dev/null
+++ b/local/recipes/system/numad/source/src/main.rs
@@ -0,0 +1,236 @@
+/// numad — Red Bear OS NUMA topology daemon
+///
+/// Reads ACPI SRAT/SLIT from physical memory via /scheme/memory/physical
+/// and feeds NUMA topology hints to the kernel for scheduler placement.
+use std::fs;
+use std::io::{Read, Write};
+use std::mem;
+
+const RSDP_SIGNATURE: &[u8; 8] = b"RSD PTR ";
+const SRAT_SIGNATURE: &[u8; 4] = b"SRAT";
+const SLIT_SIGNATURE: &[u8; 4] = b"SLIT";
+const MAX_NUMA_NODES: usize = 8;
+
+#[repr(C, packed)]
+#[derive(Copy, Clone)]
+struct Rsdp {
+    signature: [u8; 8],
+    checksum: u8,
+    oem_id: [u8; 6],
+    revision: u8,
+    rsdt_addr: u32,
+}
+
+#[repr(C, packed)]
+#[derive(Copy, Clone)]
+struct SdtHeader {
+    signature: [u8; 4],
+    length: u32,
+    revision: u8,
+    checksum: u8,
+    oem_id: [u8; 6],
+    oem_table_id: [u8; 8],
+    oem_revision: u32,
+    creator_id: u32,
+    creator_revision: u32,
+}
+
+#[repr(C, packed)]
+#[derive(Copy, Clone)]
+struct SratEntry {
+    entry_type: u8,
+    length: u8,
+}
+
+#[repr(C, packed)]
+#[derive(Copy, Clone)]
+struct SratProcessorApic {
+    entry: SratEntry,
+    proximity_domain_lo: u8,
+    apic_id: u8,
+    flags: u32,
+    local_sapic_eid: u8,
+    proximity_domain_hi: [u8; 3],
+    clock_domain: u32,
+}
+
+#[repr(C, packed)]
+#[derive(Copy, Clone)]
+struct SratMemory {
+    entry: SratEntry,
+    proximity_domain: u32,
+    reserved: u16,
+    base_address: u64,
+    length: u64,
+    reserved2: [u8; 8],
+    flags: u32,
+    reserved3: [u8; 8],
+}
+
+struct NumaNode {
+    id: u8,
+    apic_ids: Vec<u8>,
+}
+
+fn main() {
+    eprintln!("numad: starting NUMA topology discovery");
+
+    // Read RSDP from known physical locations (EBDA or BIOS area)
+    let rsdp = match find_rsdp() {
+        Some(r) => r,
+        None => {
+            eprintln!("numad: no RSDP found, assuming UMA (single-node)");
+            return;
+        }
+    };
+
+    // Read RSDT to find SRAT and SLIT
+    let sdt_addr = rsdp.rsdt_addr as usize;
+    let sdt_header = read_phys::<SdtHeader>(sdt_addr);
+    if &sdt_header.signature != b"RSDT" {
+        eprintln!("numad: no RSDT found");
+        return;
+    }
+
+    let num_entries = (sdt_header.length as usize - mem::size_of::<SdtHeader>()) / 4;
+    let entries_base = sdt_addr + mem::size_of::<SdtHeader>();
+
+    let mut srat_data: Option<Vec<u8>> = None;
+    let mut slit_data: Option<Vec<u8>> = None;
+
+    for i in 0..num_entries {
+        let entry_addr = entries_base + i * 4;
+        let table_ptr: u32 = read_phys(entry_addr);
+        let table_addr = table_ptr as usize;
+        if table_addr == 0 {
+            continue;
+        }
+        let header = read_phys::<SdtHeader>(table_addr);
+        match &header.signature {
+            SRAT_SIGNATURE => {
+                srat_data = Some(read_phys_bytes(table_addr, header.length as usize));
+
+fn find_rsdp() -> Option<Rsdp> {
+    // Search EBDA and BIOS areas for RSDP signature
+    let search_areas: &[(usize, usize)] = &[
+        (0x000E_0000, 0x000F_FFFF), // BIOS ROM area
+        (0x0008_0000, 0x0009_FFFF), // EBDA/upper conventional
+    ];
+
+    for &(start, end) in search_areas {
+        for addr in (start..end).step_by(16) {
+            if addr + mem::size_of::<Rsdp>() > end {
+                break;
+            }
+            let sig = read_phys_bytes(addr, 8);
+            if &sig == RSDP_SIGNATURE {
+                let rsdp: Rsdp = read_phys(addr);
+                if validate_checksum(&rsdp) {
+                    return Some(rsdp);
+                }
+            }
+        }
+    }
+    None
+}
+
+fn validate_checksum(rsdp: &Rsdp) -> bool {
+    let bytes = unsafe {
+        std::slice::from_raw_parts(rsdp as *const _ as *const u8, mem::size_of::<Rsdp>())
+    };
+    bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b)) == 0
+}
+
+fn read_phys<T>(addr: usize) -> T {
+    let path = format!("/scheme/memory/physical@{}", addr);
+    if let Ok(mut fd) = fs::File::open(&path) {
+        let mut buf = vec![0u8; mem::size_of::<T>()];
+        if fd.read_exact(&mut buf).is_ok() {
+            return unsafe { std::ptr::read(buf.as_ptr() as *const T) };
+        }
+    }
+    // Fall back to zeroed data when the scheme is unavailable.
+    unsafe { std::mem::zeroed() }
+}
+
+fn read_phys_bytes(addr: usize, len: usize) -> Vec<u8> {
+    let path = format!("/scheme/memory/physical@{}", addr);
+    if let Ok(mut fd) = fs::File::open(&path) {
+        let mut buf = vec![0u8; len];
+        if fd.read_exact(&mut buf).is_ok() {
+            return buf;
+        }
+    }
+    vec![0u8; len]
+}
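+
+// Illustrative self-check only: exercises the checksum rule enforced by
+// validate_checksum() (all 20 bytes of the RSDP must sum to 0 mod 256)
+// against a synthetic table; the OEM ID and address below are made up.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn rsdp_checksum_balances_to_zero() {
+        let mut rsdp = Rsdp {
+            signature: *RSDP_SIGNATURE,
+            checksum: 0,
+            oem_id: *b"REDBER",
+            revision: 0,
+            rsdt_addr: 0x07FE_0000,
+        };
+        let sum = {
+            let bytes = unsafe {
+                std::slice::from_raw_parts(&rsdp as *const _ as *const u8, mem::size_of::<Rsdp>())
+            };
+            bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
+        };
+        // Choose the checksum byte so the whole structure sums to zero.
+        rsdp.checksum = 0u8.wrapping_sub(sum);
+        assert!(validate_checksum(&rsdp));
+    }
+}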
diff --git a/local/recipes/system/redbear-acmd/recipe.toml b/local/recipes/system/redbear-acmd/recipe.toml
new file mode 100644
index 00000000..04d49d93
--- /dev/null
+++ b/local/recipes/system/redbear-acmd/recipe.toml
@@ -0,0 +1,8 @@
+[source]
+path = "source"
+
+[build]
+template = "cargo"
+
+[package.files]
+"/usr/bin/redbear-acmd" = "redbear-acmd"
diff --git a/recipes/core/base/recipe.toml b/recipes/core/base/recipe.toml
index 405d9b6a..293c8dda 100644
--- a/recipes/core/base/recipe.toml
+++ b/recipes/core/base/recipe.toml
@@ -6,6 +6,8 @@ patches = [
     "P0-workspace-add-bootstrap.patch",
     "P0-bootstrap-workspace-fix.patch",
     "P2-i2c-gpio-ucsi-drivers.patch",
+    "P3-pcid-bind-scheme.patch",
+    "P3-acpi-wave12-hardening.patch",
 ]
 
 [build]
diff --git a/recipes/core/kernel/recipe.toml b/recipes/core/kernel/recipe.toml
index a43deb48..264a631b 100644
--- a/recipes/core/kernel/recipe.toml
+++ b/recipes/core/kernel/recipe.toml
@@ -1,6 +1,6 @@
 [source]
 git = "https://gitlab.redox-os.org/redox-os/kernel.git"
-patches = ["redox.patch", "P0-canary.patch", "P1-memory-map-overflow.patch", "../../../local/patches/kernel/P4-supplementary-groups.patch"]
+patches = ["redox.patch", "P0-canary.patch", "P1-memory-map-overflow.patch", "../../../local/patches/kernel/P4-supplementary-groups.patch", "../../../local/patches/kernel/P4-s3-suspend-resume.patch", "../../../local/patches/kernel/P5-sched-policy-context.patch", "../../../local/patches/kernel/P5-sched-rt-policy.patch", "../../../local/patches/kernel/P5-proc-setschedpolicy.patch", "../../../local/patches/kernel/P5-scheme-sched-id.patch", "../../../local/patches/kernel/P5-context-mod-sched.patch", "../../../local/patches/kernel/P6-vruntime-context.patch", "../../../local/patches/kernel/P6-percpu-runqueues.patch", "../../../local/patches/kernel/P6-futex-sharding.patch", "../../../local/patches/kernel/P6-vruntime-switch.patch", "../../../local/patches/kernel/P7-cache-affine-context.patch", "../../../local/patches/kernel/P7-cache-affine-switch.patch", "../../../local/patches/kernel/P7-proc-setname.patch", "../../../local/patches/kernel/P7-proc-setpriority.patch", "../../../local/patches/kernel/P8-futex-requeue.patch", "../../../local/patches/kernel/P8-futex-pi.patch", "../../../local/patches/kernel/P8-futex-robust.patch", "../../../local/patches/kernel/P8-percpu-wiring.patch", "../../../local/patches/kernel/P8-percpu-sched.patch", "../../../local/patches/kernel/P9-proc-lock-ordering.patch", "../../../local/patches/kernel/P9-futex-pi-cas-fix.patch"]
 
 [build]
 template = "custom"
diff --git a/recipes/core/relibc/recipe.toml b/recipes/core/relibc/recipe.toml
index 77ad807d..821000e1 100644
--- a/recipes/core/relibc/recipe.toml
+++ b/recipes/core/relibc/recipe.toml
@@ -22,6 +22,7 @@ patches = [
     "../../../local/patches/relibc/P3-select-not-epoll-timeout.patch",
     "../../../local/patches/relibc/P3-tls-get-addr-panic-fix.patch",
     "../../../local/patches/relibc/P3-pthread-yield.patch",
+    "../../../local/patches/relibc/P3-barrier-smp-futex.patch",
     "../../../local/patches/relibc/P3-secure-getenv.patch",
     "../../../local/patches/relibc/P3-getentropy.patch",
     "../../../local/patches/relibc/P3-dup3.patch",
@@ -38,10 +39,19 @@ patches = [
     "../../../local/patches/relibc/P3-header-mod-spawn-threads.patch",
     "../../../local/patches/relibc/P3-spawn.patch",
     "../../../local/patches/relibc/P3-threads.patch",
+    "../../../local/patches/relibc/P3-pthread-signal-races.patch",
     "../../../local/patches/relibc/P3-sysv-ipc.patch",
     "../../../local/patches/relibc/P3-sysv-sem-impl.patch",
     "../../../local/patches/relibc/P3-sysv-shm-impl.patch",
     "../../../local/patches/relibc/P4-setgroups-getgroups.patch",
+    "../../../local/patches/relibc/P5-robust-mutexes.patch",
"../../../local/patches/relibc/P5-sched-api.patch", + "../../../local/patches/relibc/P5-pthread-sigmask-race.patch", + "../../../local/patches/relibc/P4-setgroups-unsafe-fix.patch", + "../../../local/patches/relibc/P7-setpriority.patch", + "../../../local/patches/relibc/P7-pthread-affinity.patch", + "../../../local/patches/relibc/P7-pthread-setname.patch", + "../../../local/patches/relibc/P9-spin-and-barrier.patch", ] [build] diff --git a/recipes/drivers/ehcid b/recipes/drivers/ehcid new file mode 120000 index 00000000..a356db53 --- /dev/null +++ b/recipes/drivers/ehcid @@ -0,0 +1 @@ +../../local/recipes/drivers/ehcid \ No newline at end of file diff --git a/recipes/drivers/ohcid b/recipes/drivers/ohcid new file mode 120000 index 00000000..98ff3190 --- /dev/null +++ b/recipes/drivers/ohcid @@ -0,0 +1 @@ +../../local/recipes/drivers/ohcid \ No newline at end of file diff --git a/recipes/drivers/redox-driver-core b/recipes/drivers/redox-driver-core new file mode 120000 index 00000000..048405e3 --- /dev/null +++ b/recipes/drivers/redox-driver-core @@ -0,0 +1 @@ +../../local/recipes/drivers/redox-driver-core \ No newline at end of file diff --git a/recipes/drivers/redox-driver-pci b/recipes/drivers/redox-driver-pci new file mode 120000 index 00000000..09991140 --- /dev/null +++ b/recipes/drivers/redox-driver-pci @@ -0,0 +1 @@ +../../local/recipes/drivers/redox-driver-pci \ No newline at end of file diff --git a/recipes/drivers/uhcid b/recipes/drivers/uhcid new file mode 120000 index 00000000..f96a3ee6 --- /dev/null +++ b/recipes/drivers/uhcid @@ -0,0 +1 @@ +../../local/recipes/drivers/uhcid \ No newline at end of file diff --git a/recipes/drivers/usb-core b/recipes/drivers/usb-core new file mode 120000 index 00000000..d6a7206c --- /dev/null +++ b/recipes/drivers/usb-core @@ -0,0 +1 @@ +../../local/recipes/drivers/usb-core \ No newline at end of file diff --git a/recipes/kde/kf6-pty b/recipes/kde/kf6-pty new file mode 120000 index 00000000..cdd7f724 --- /dev/null +++ b/recipes/kde/kf6-pty @@ -0,0 +1 @@ +../../local/recipes/kde/kf6-pty \ No newline at end of file diff --git a/recipes/libs/libiconv/recipe.toml b/recipes/libs/libiconv/recipe.toml index a70ef826..b91224c1 100644 --- a/recipes/libs/libiconv/recipe.toml +++ b/recipes/libs/libiconv/recipe.toml @@ -22,6 +22,137 @@ script = """ DYNAMIC_STATIC_INIT COOKBOOK_CONFIGURE_FLAGS+=( ac_cv_have_decl_program_invocation_name=no + ac_cv_objext=o + ac_cv_prog_cc_c_o=yes + ac_cv_exeext= + acl_cv_rpath=done ) + +# Restore the pristine configure scripts on every build, then layer our Redox +# cross-build fixes on top. Host autoconf 2.72 regenerates an invalid top-level +# configure for this recipe in our environment, so we patch the shipped script +# instead of regenerating it. +python3 - <<'PYEOF' +import os +import tarfile +from pathlib import Path + +source_dir = Path(os.environ["COOKBOOK_SOURCE"]) +source_tar = Path(os.environ["COOKBOOK_RECIPE"]) / "source.tar" +with tarfile.open(source_tar) as tf: + for relative in ("configure", "libcharset/configure"): + member = next(m for m in tf.getmembers() if m.name.endswith("/" + relative)) + target = source_dir / relative + target.write_text(tf.extractfile(member).read().decode("utf-8", errors="replace")) +PYEOF + +# Upgrade bundled libtool glue in both the top-level tree and nested +# libcharset tree to the current host libtool (2.6.0) so generated libtool +# helpers match the host ltmain.sh version. 
+
+# Upgrade bundled libtool glue in both the top-level tree and the nested
+# libcharset tree to the current host libtool (2.6.0) so the generated libtool
+# helpers match the host ltmain.sh version.
+for subdir in "${COOKBOOK_SOURCE}" "${COOKBOOK_SOURCE}/libcharset"; do
+    if [ -d "${subdir}" ]; then
+        mkdir -p "${subdir}/m4" "${subdir}/build-aux"
+        cp -f /usr/share/aclocal/libtool.m4 "${subdir}/m4/"
+        cp -f /usr/share/aclocal/ltoptions.m4 "${subdir}/m4/"
+        cp -f /usr/share/aclocal/ltsugar.m4 "${subdir}/m4/"
+        cp -f /usr/share/aclocal/ltversion.m4 "${subdir}/m4/"
+        cp -f /usr/share/aclocal/lt~obsolete.m4 "${subdir}/m4/"
+        cp -f /usr/share/libtool/build-aux/ltmain.sh "${subdir}/build-aux/"
+    fi
+done
+
+if [ -d "${COOKBOOK_SOURCE}/libcharset" ]; then
+    (
+        cd "${COOKBOOK_SOURCE}/libcharset"
+        cp -f ../srcm4/relocatable.m4 m4/
+        cp -f ../srcm4/codeset.m4 m4/
+        cp -f ../srcm4/fcntl-o.m4 m4/
+        cp -f ../srcm4/visibility.m4 m4/
+    )
+fi
+
+# libcharset templates currently keep @HAVE_VISIBILITY@ unsubstituted on our
+# Redox cross build. Patch the source templates before configure so every
+# generated header gets a stable fallback value.
+for template in \
+    "${COOKBOOK_SOURCE}/libcharset/include/libcharset.h.build.in" \
+    "${COOKBOOK_SOURCE}/libcharset/include/localcharset.h.build.in" \
+    "${COOKBOOK_SOURCE}/include/iconv.h.build.in"
+do
+    if [ -f "${template}" ]; then
+        sed -i 's/@HAVE_VISIBILITY@/0/g' "${template}"
+    fi
+done
+
+export CPP="${GNU_TARGET}-gcc -E"
+
+# Force cross mode in the shipped top-level configure and keep the rest of the
+# generated shell structure intact.
+sed -i '0,/cross_compiling=maybe/s//cross_compiling=yes/' "${COOKBOOK_SOURCE}/configure"
+python3 - <<'PYEOF'
+from pathlib import Path
+import os
+for relative in ('configure', 'libcharset/configure'):
+    path = Path(os.environ['COOKBOOK_SOURCE']) / relative
+    lines = path.read_text().splitlines()
+    for i, line in enumerate(lines):
+        if "macro_version='2.4.7'" in line or "macro_version='2.5.4-redox-9510'" in line:
+            lines[i] = "macro_version='2.6.0'"
+        if "macro_revision='2.4.7'" in line or "macro_revision='2.5.4-redox-9510'" in line:
+            lines[i] = "macro_revision='2.6.0'"
+        if "grep -v '^ *+' conftest.err >conftest.er1" in line:
+            lines[i] = "test -f conftest.err && grep -v '^ *+' conftest.err > conftest.er1.tmp && mv -f conftest.er1.tmp conftest.er1 || :"
+        if 'cat conftest.er1 >&5' in line:
+            lines[i] = 'test -f conftest.er1 && cat conftest.er1 >&5 || :'
+        if 'mv -f conftest.er1 conftest.err' in line:
+            lines[i] = 'test -f conftest.er1 && mv -f conftest.er1 conftest.err || :'
+        if line.strip() == 'rm -f conftest conftest$ac_cv_exeext':
+            lines[i] = 'rm -rf conftest conftest$ac_cv_exeext'
+    path.write_text("\\n".join(lines) + "\\n")
+PYEOF
 cookbook_configure
-"""
\ No newline at end of file
+
+# libcharset's configure currently leaves @HAVE_VISIBILITY@ unsubstituted in
+# generated headers on our Redox cross build. Normalize the generated headers
+# so the compile path matches the already-published libiconv artifact.
+for header in \
+    include/libcharset.h \
+    include/localcharset.h \
+    libcharset/include/libcharset.h \
+    libcharset/include/localcharset.h
+do
+    if [ -f "${header}" ]; then
+        sed -i 's/@HAVE_VISIBILITY@/0/g' "${header}"
+    fi
+done
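+
+# For example, a guarded template line of the shape (illustrative, not quoted
+# verbatim from libcharset):
+#     #if @HAVE_VISIBILITY@ && BUILDING_LIBCHARSET
+# becomes, after the substitution above:
+#     #if 0 && BUILDING_LIBCHARSET
+# so every consumer compiles against the no-visibility fallback path.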
+if [ -d "libcharset" ]; then + ( + cd libcharset + "${COOKBOOK_SOURCE}/libcharset/configure" \ + --disable-option-checking \ + --prefix=/usr \ + --host="${GNU_TARGET}" \ + --enable-shared \ + --enable-static \ + ac_cv_have_decl_program_invocation_name=no \ + CC="${GNU_TARGET}-gcc" \ + LDFLAGS="${LDFLAGS}" \ + CPPFLAGS="${CPPFLAGS}" \ + --cache-file=/dev/null \ + --srcdir="${COOKBOOK_SOURCE}/libcharset" + ) + for header in \ + libcharset/include/libcharset.h \ + libcharset/include/localcharset.h + do + if [ -f "${header}" ]; then + sed -i 's/@HAVE_VISIBILITY@/0/g' "${header}" + fi + done +fi + +""" diff --git a/recipes/system/cpufreqd b/recipes/system/cpufreqd new file mode 120000 index 00000000..ac730d25 --- /dev/null +++ b/recipes/system/cpufreqd @@ -0,0 +1 @@ +../../local/recipes/system/cpufreqd \ No newline at end of file diff --git a/recipes/system/driver-manager b/recipes/system/driver-manager new file mode 120000 index 00000000..3c51d18b --- /dev/null +++ b/recipes/system/driver-manager @@ -0,0 +1 @@ +../../local/recipes/system/driver-manager \ No newline at end of file diff --git a/recipes/system/hwrngd b/recipes/system/hwrngd new file mode 120000 index 00000000..1ecbb035 --- /dev/null +++ b/recipes/system/hwrngd @@ -0,0 +1 @@ +../../local/recipes/system/hwrngd \ No newline at end of file diff --git a/recipes/system/numad b/recipes/system/numad new file mode 120000 index 00000000..70671291 --- /dev/null +++ b/recipes/system/numad @@ -0,0 +1 @@ +../../local/recipes/system/numad \ No newline at end of file diff --git a/recipes/system/thermald b/recipes/system/thermald new file mode 120000 index 00000000..c13e2187 --- /dev/null +++ b/recipes/system/thermald @@ -0,0 +1 @@ +../../local/recipes/system/thermald \ No newline at end of file