From 82feefbaee9fcab402ddd79f897ceb15713e3c6b Mon Sep 17 00:00:00 2001 From: Red Bear OS Date: Sat, 27 Jun 2026 09:19:25 +0300 Subject: [PATCH] Red Bear OS kernel baseline From release 0.1.0 pre-patched archive. This includes all Red Bear modifications previously maintained as patches in local/patches/kernel/. --- .gitignore | 3 + .gitlab-ci.yml | 90 + .gitmodules | 4 + .helix/config.toml | 2 + .helix/languages.toml | 13 + ARM-AARCH64-PORT-OUTLINE.md | 79 + Cargo.lock | 421 +++ Cargo.toml | 120 + LICENSE | 21 + Makefile | 66 + README.md | 81 + build.rs | 100 + clippy.sh | 7 + config.toml.example | 7 + linkers/aarch64.ld | 55 + linkers/i586.ld | 51 + linkers/riscv64.ld | 61 + linkers/x86_64.ld | 60 + res/unifont.font | Bin 0 -> 4096 bytes rmm/Cargo.toml | 16 + rmm/README.md | 4 + rmm/src/allocator/frame/buddy.rs | 296 ++ rmm/src/allocator/frame/bump.rs | 79 + rmm/src/allocator/frame/mod.rs | 83 + rmm/src/allocator/mod.rs | 3 + rmm/src/arch/aarch64.rs | 153 + rmm/src/arch/emulate.rs | 355 ++ rmm/src/arch/mod.rs | 93 + rmm/src/arch/riscv64/mod.rs | 7 + rmm/src/arch/riscv64/sv39.rs | 124 + rmm/src/arch/riscv64/sv48.rs | 118 + rmm/src/arch/riscv64/sv57.rs | 116 + rmm/src/arch/x86.rs | 80 + rmm/src/arch/x86_64.rs | 107 + rmm/src/arch/x86_shared.rs | 37 + rmm/src/lib.rs | 97 + rmm/src/main.rs | 309 ++ rmm/src/page/entry.rs | 59 + rmm/src/page/flags.rs | 157 + rmm/src/page/flush.rs | 71 + rmm/src/page/mapper.rs | 269 ++ rmm/src/page/mod.rs | 7 + rmm/src/page/table.rs | 105 + rust-toolchain.toml | 3 + rustfmt.toml | 22 + src/acpi/gtdt.rs | 64 + src/acpi/hpet.rs | 121 + src/acpi/madt/arch/aarch64.rs | 97 + src/acpi/madt/arch/other.rs | 9 + src/acpi/madt/arch/x86.rs | 160 + src/acpi/madt/mod.rs | 240 ++ src/acpi/mod.rs | 212 ++ src/acpi/rsdp.rs | 34 + src/acpi/rsdt.rs | 52 + src/acpi/rxsdt.rs | 6 + src/acpi/sdt.rs | 27 + src/acpi/spcr.rs | 140 + src/acpi/xsdt.rs | 50 + src/allocator/linked_list.rs | 50 + src/allocator/mod.rs | 48 + src/arch/aarch64/consts.rs | 15 + 
src/arch/aarch64/debug.rs | 19 + src/arch/aarch64/device/cpu/mod.rs | 277 ++ .../device/cpu/registers/control_regs.rs | 167 + .../aarch64/device/cpu/registers/id_regs.rs | 151 + src/arch/aarch64/device/cpu/registers/mod.rs | 2 + src/arch/aarch64/device/generic_timer.rs | 145 + src/arch/aarch64/device/irqchip/gic.rs | 288 ++ src/arch/aarch64/device/irqchip/gicv3.rs | 196 ++ .../aarch64/device/irqchip/irq_bcm2835.rs | 299 ++ .../aarch64/device/irqchip/irq_bcm2836.rs | 231 ++ src/arch/aarch64/device/irqchip/mod.rs | 41 + src/arch/aarch64/device/irqchip/null.rs | 41 + src/arch/aarch64/device/mod.rs | 60 + src/arch/aarch64/device/rtc.rs | 41 + src/arch/aarch64/device/serial.rs | 59 + src/arch/aarch64/interrupt/exception.rs | 236 ++ src/arch/aarch64/interrupt/handler.rs | 420 +++ src/arch/aarch64/interrupt/irq.rs | 56 + src/arch/aarch64/interrupt/mod.rs | 49 + src/arch/aarch64/interrupt/syscall.rs | 49 + src/arch/aarch64/interrupt/trace.rs | 32 + src/arch/aarch64/ipi.rs | 30 + src/arch/aarch64/misc.rs | 23 + src/arch/aarch64/mod.rs | 71 + src/arch/aarch64/paging.rs | 7 + src/arch/aarch64/start.rs | 148 + src/arch/aarch64/stop.rs | 33 + src/arch/aarch64/time.rs | 18 + src/arch/aarch64/vectors.rs | 112 + src/arch/mod.rs | 27 + src/arch/riscv64/consts.rs | 16 + src/arch/riscv64/debug.rs | 19 + src/arch/riscv64/device/cpu/mod.rs | 5 + src/arch/riscv64/device/irqchip/clint.rs | 42 + src/arch/riscv64/device/irqchip/clint_sbi.rs | 150 + src/arch/riscv64/device/irqchip/hlic.rs | 170 + src/arch/riscv64/device/irqchip/mod.rs | 44 + src/arch/riscv64/device/irqchip/plic.rs | 198 ++ src/arch/riscv64/device/mod.rs | 110 + src/arch/riscv64/device/serial.rs | 47 + src/arch/riscv64/interrupt/exception.rs | 229 ++ src/arch/riscv64/interrupt/handler.rs | 332 ++ src/arch/riscv64/interrupt/mod.rs | 39 + src/arch/riscv64/interrupt/syscall.rs | 1 + src/arch/riscv64/interrupt/trace.rs | 35 + src/arch/riscv64/ipi.rs | 34 + src/arch/riscv64/misc.rs | 47 + src/arch/riscv64/mod.rs | 69 + 
src/arch/riscv64/paging.rs | 5 + src/arch/riscv64/start.rs | 140 + src/arch/riscv64/stop.rs | 15 + src/arch/riscv64/time.rs | 34 + src/arch/x86/consts.rs | 20 + src/arch/x86/interrupt/handler.rs | 471 +++ src/arch/x86/interrupt/mod.rs | 10 + src/arch/x86/interrupt/syscall.rs | 48 + src/arch/x86/mod.rs | 41 + src/arch/x86_64/alternative.rs | 317 ++ src/arch/x86_64/consts.rs | 26 + src/arch/x86_64/interrupt/handler.rs | 532 +++ src/arch/x86_64/interrupt/mod.rs | 10 + src/arch/x86_64/interrupt/syscall.rs | 195 ++ src/arch/x86_64/macros.rs | 80 + src/arch/x86_64/misc.rs | 29 + src/arch/x86_64/mod.rs | 50 + src/arch/x86_shared/cpuid.rs | 29 + src/arch/x86_shared/debug.rs | 62 + src/arch/x86_shared/device/cpu.rs | 281 ++ src/arch/x86_shared/device/hpet.rs | 125 + src/arch/x86_shared/device/ioapic.rs | 427 +++ src/arch/x86_shared/device/local_apic.rs | 272 ++ src/arch/x86_shared/device/mod.rs | 94 + src/arch/x86_shared/device/pic.rs | 98 + src/arch/x86_shared/device/pit.rs | 50 + src/arch/x86_shared/device/rtc.rs | 0 src/arch/x86_shared/device/serial.rs | 48 + src/arch/x86_shared/device/system76_ec.rs | 89 + src/arch/x86_shared/device/tsc.rs | 161 + src/arch/x86_shared/gdt.rs | 431 +++ src/arch/x86_shared/idt.rs | 361 ++ src/arch/x86_shared/interrupt/exception.rs | 294 ++ src/arch/x86_shared/interrupt/ipi.rs | 28 + src/arch/x86_shared/interrupt/irq.rs | 352 ++ src/arch/x86_shared/interrupt/mod.rs | 44 + src/arch/x86_shared/interrupt/trace.rs | 33 + src/arch/x86_shared/ipi.rs | 53 + src/arch/x86_shared/mod.rs | 45 + src/arch/x86_shared/paging.rs | 10 + src/arch/x86_shared/pti.rs | 86 + src/arch/x86_shared/start.rs | 221 ++ src/arch/x86_shared/stop.rs | 122 + src/arch/x86_shared/time.rs | 69 + src/asm/x86/trampoline.asm | 160 + src/asm/x86_64/trampoline.asm | 168 + src/common/aligned_box.rs | 128 + src/common/int_like.rs | 161 + src/common/mod.rs | 27 + src/context/arch/aarch64.rs | 391 +++ src/context/arch/riscv64.rs | 224 ++ src/context/arch/x86.rs | 315 ++ 
src/context/arch/x86_64.rs | 395 +++ src/context/context.rs | 1074 ++++++ src/context/file.rs | 104 + src/context/memory.rs | 2984 +++++++++++++++++ src/context/mod.rs | 324 ++ src/context/signal.rs | 105 + src/context/switch.rs | 577 ++++ src/context/timeout.rs | 82 + src/cpu_set.rs | 134 + src/cpu_stats.rs | 163 + src/debugger.rs | 366 ++ src/devices/graphical_debug/debug.rs | 155 + src/devices/graphical_debug/mod.rs | 70 + src/devices/mod.rs | 4 + src/devices/serial.rs | 72 + src/devices/uart_16550.rs | 163 + src/devices/uart_pl011.rs | 254 ++ src/dtb/irqchip.rs | 422 +++ src/dtb/mod.rs | 246 ++ src/dtb/serial.rs | 69 + src/event.rs | 264 ++ src/log.rs | 73 + src/macros.rs | 56 + src/main.rs | 145 + src/memory/kernel_mapper.rs | 91 + src/memory/mod.rs | 1090 ++++++ src/memory/page.rs | 85 + src/panic.rs | 202 ++ src/percpu.rs | 205 ++ src/profiling.rs | 330 ++ src/ptrace.rs | 248 ++ src/scheme/acpi.rs | 336 ++ src/scheme/debug.rs | 237 ++ src/scheme/dtb.rs | 163 + src/scheme/event.rs | 130 + src/scheme/irq.rs | 570 ++++ src/scheme/memory.rs | 326 ++ src/scheme/mod.rs | 795 +++++ src/scheme/pipe.rs | 521 +++ src/scheme/proc.rs | 1570 +++++++++ src/scheme/serio.rs | 157 + src/scheme/sys/block.rs | 34 + src/scheme/sys/context.rs | 186 + src/scheme/sys/cpu.rs | 16 + src/scheme/sys/exe.rs | 11 + src/scheme/sys/fdstat.rs | 107 + src/scheme/sys/iostat.rs | 129 + src/scheme/sys/irq.rs | 17 + src/scheme/sys/log.rs | 16 + src/scheme/sys/mod.rs | 333 ++ src/scheme/sys/stat.rs | 96 + src/scheme/sys/syscall.rs | 39 + src/scheme/sys/uname.rs | 12 + src/scheme/time.rs | 217 ++ src/scheme/user.rs | 2164 ++++++++++++ src/startup/memory.rs | 447 +++ src/startup/mod.rs | 238 ++ src/sync/mod.rs | 5 + src/sync/ordered.rs | 734 ++++ src/sync/wait_condition.rs | 147 + src/sync/wait_queue.rs | 106 + src/syscall/debug.rs | 316 ++ src/syscall/fs.rs | 779 +++++ src/syscall/futex.rs | 222 ++ src/syscall/mod.rs | 258 ++ src/syscall/process.rs | 294 ++ src/syscall/time.rs | 91 + 
src/syscall/usercopy.rs | 255 ++ src/time.rs | 36 + targets/aarch64-unknown-kernel.json | 24 + targets/i586-unknown-kernel.json | 26 + targets/riscv64-unknown-kernel.json | 24 + targets/x86_64-unknown-kernel.json | 25 + 234 files changed, 40494 insertions(+) create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 .gitmodules create mode 100644 .helix/config.toml create mode 100644 .helix/languages.toml create mode 100644 ARM-AARCH64-PORT-OUTLINE.md create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 build.rs create mode 100755 clippy.sh create mode 100644 config.toml.example create mode 100644 linkers/aarch64.ld create mode 100644 linkers/i586.ld create mode 100644 linkers/riscv64.ld create mode 100644 linkers/x86_64.ld create mode 100644 res/unifont.font create mode 100644 rmm/Cargo.toml create mode 100644 rmm/README.md create mode 100644 rmm/src/allocator/frame/buddy.rs create mode 100644 rmm/src/allocator/frame/bump.rs create mode 100644 rmm/src/allocator/frame/mod.rs create mode 100644 rmm/src/allocator/mod.rs create mode 100644 rmm/src/arch/aarch64.rs create mode 100644 rmm/src/arch/emulate.rs create mode 100644 rmm/src/arch/mod.rs create mode 100644 rmm/src/arch/riscv64/mod.rs create mode 100644 rmm/src/arch/riscv64/sv39.rs create mode 100644 rmm/src/arch/riscv64/sv48.rs create mode 100644 rmm/src/arch/riscv64/sv57.rs create mode 100644 rmm/src/arch/x86.rs create mode 100644 rmm/src/arch/x86_64.rs create mode 100644 rmm/src/arch/x86_shared.rs create mode 100644 rmm/src/lib.rs create mode 100644 rmm/src/main.rs create mode 100644 rmm/src/page/entry.rs create mode 100644 rmm/src/page/flags.rs create mode 100644 rmm/src/page/flush.rs create mode 100644 rmm/src/page/mapper.rs create mode 100644 rmm/src/page/mod.rs create mode 100644 rmm/src/page/table.rs create mode 100644 rust-toolchain.toml create mode 100644 rustfmt.toml 
create mode 100644 src/acpi/gtdt.rs create mode 100644 src/acpi/hpet.rs create mode 100644 src/acpi/madt/arch/aarch64.rs create mode 100644 src/acpi/madt/arch/other.rs create mode 100644 src/acpi/madt/arch/x86.rs create mode 100644 src/acpi/madt/mod.rs create mode 100644 src/acpi/mod.rs create mode 100644 src/acpi/rsdp.rs create mode 100644 src/acpi/rsdt.rs create mode 100644 src/acpi/rxsdt.rs create mode 100644 src/acpi/sdt.rs create mode 100644 src/acpi/spcr.rs create mode 100644 src/acpi/xsdt.rs create mode 100644 src/allocator/linked_list.rs create mode 100644 src/allocator/mod.rs create mode 100644 src/arch/aarch64/consts.rs create mode 100644 src/arch/aarch64/debug.rs create mode 100644 src/arch/aarch64/device/cpu/mod.rs create mode 100644 src/arch/aarch64/device/cpu/registers/control_regs.rs create mode 100644 src/arch/aarch64/device/cpu/registers/id_regs.rs create mode 100644 src/arch/aarch64/device/cpu/registers/mod.rs create mode 100644 src/arch/aarch64/device/generic_timer.rs create mode 100644 src/arch/aarch64/device/irqchip/gic.rs create mode 100644 src/arch/aarch64/device/irqchip/gicv3.rs create mode 100644 src/arch/aarch64/device/irqchip/irq_bcm2835.rs create mode 100644 src/arch/aarch64/device/irqchip/irq_bcm2836.rs create mode 100644 src/arch/aarch64/device/irqchip/mod.rs create mode 100644 src/arch/aarch64/device/irqchip/null.rs create mode 100644 src/arch/aarch64/device/mod.rs create mode 100644 src/arch/aarch64/device/rtc.rs create mode 100644 src/arch/aarch64/device/serial.rs create mode 100644 src/arch/aarch64/interrupt/exception.rs create mode 100644 src/arch/aarch64/interrupt/handler.rs create mode 100644 src/arch/aarch64/interrupt/irq.rs create mode 100644 src/arch/aarch64/interrupt/mod.rs create mode 100644 src/arch/aarch64/interrupt/syscall.rs create mode 100644 src/arch/aarch64/interrupt/trace.rs create mode 100644 src/arch/aarch64/ipi.rs create mode 100644 src/arch/aarch64/misc.rs create mode 100644 src/arch/aarch64/mod.rs create mode 
100644 src/arch/aarch64/paging.rs create mode 100644 src/arch/aarch64/start.rs create mode 100644 src/arch/aarch64/stop.rs create mode 100644 src/arch/aarch64/time.rs create mode 100644 src/arch/aarch64/vectors.rs create mode 100644 src/arch/mod.rs create mode 100644 src/arch/riscv64/consts.rs create mode 100644 src/arch/riscv64/debug.rs create mode 100644 src/arch/riscv64/device/cpu/mod.rs create mode 100644 src/arch/riscv64/device/irqchip/clint.rs create mode 100644 src/arch/riscv64/device/irqchip/clint_sbi.rs create mode 100644 src/arch/riscv64/device/irqchip/hlic.rs create mode 100644 src/arch/riscv64/device/irqchip/mod.rs create mode 100644 src/arch/riscv64/device/irqchip/plic.rs create mode 100644 src/arch/riscv64/device/mod.rs create mode 100644 src/arch/riscv64/device/serial.rs create mode 100644 src/arch/riscv64/interrupt/exception.rs create mode 100644 src/arch/riscv64/interrupt/handler.rs create mode 100644 src/arch/riscv64/interrupt/mod.rs create mode 100644 src/arch/riscv64/interrupt/syscall.rs create mode 100644 src/arch/riscv64/interrupt/trace.rs create mode 100644 src/arch/riscv64/ipi.rs create mode 100644 src/arch/riscv64/misc.rs create mode 100644 src/arch/riscv64/mod.rs create mode 100644 src/arch/riscv64/paging.rs create mode 100644 src/arch/riscv64/start.rs create mode 100644 src/arch/riscv64/stop.rs create mode 100644 src/arch/riscv64/time.rs create mode 100644 src/arch/x86/consts.rs create mode 100644 src/arch/x86/interrupt/handler.rs create mode 100644 src/arch/x86/interrupt/mod.rs create mode 100644 src/arch/x86/interrupt/syscall.rs create mode 100644 src/arch/x86/mod.rs create mode 100644 src/arch/x86_64/alternative.rs create mode 100644 src/arch/x86_64/consts.rs create mode 100644 src/arch/x86_64/interrupt/handler.rs create mode 100644 src/arch/x86_64/interrupt/mod.rs create mode 100644 src/arch/x86_64/interrupt/syscall.rs create mode 100644 src/arch/x86_64/macros.rs create mode 100644 src/arch/x86_64/misc.rs create mode 100644 
src/arch/x86_64/mod.rs create mode 100644 src/arch/x86_shared/cpuid.rs create mode 100644 src/arch/x86_shared/debug.rs create mode 100644 src/arch/x86_shared/device/cpu.rs create mode 100644 src/arch/x86_shared/device/hpet.rs create mode 100644 src/arch/x86_shared/device/ioapic.rs create mode 100644 src/arch/x86_shared/device/local_apic.rs create mode 100644 src/arch/x86_shared/device/mod.rs create mode 100644 src/arch/x86_shared/device/pic.rs create mode 100644 src/arch/x86_shared/device/pit.rs create mode 100644 src/arch/x86_shared/device/rtc.rs create mode 100644 src/arch/x86_shared/device/serial.rs create mode 100644 src/arch/x86_shared/device/system76_ec.rs create mode 100644 src/arch/x86_shared/device/tsc.rs create mode 100644 src/arch/x86_shared/gdt.rs create mode 100644 src/arch/x86_shared/idt.rs create mode 100644 src/arch/x86_shared/interrupt/exception.rs create mode 100644 src/arch/x86_shared/interrupt/ipi.rs create mode 100644 src/arch/x86_shared/interrupt/irq.rs create mode 100644 src/arch/x86_shared/interrupt/mod.rs create mode 100644 src/arch/x86_shared/interrupt/trace.rs create mode 100644 src/arch/x86_shared/ipi.rs create mode 100644 src/arch/x86_shared/mod.rs create mode 100644 src/arch/x86_shared/paging.rs create mode 100644 src/arch/x86_shared/pti.rs create mode 100644 src/arch/x86_shared/start.rs create mode 100644 src/arch/x86_shared/stop.rs create mode 100644 src/arch/x86_shared/time.rs create mode 100644 src/asm/x86/trampoline.asm create mode 100644 src/asm/x86_64/trampoline.asm create mode 100644 src/common/aligned_box.rs create mode 100644 src/common/int_like.rs create mode 100644 src/common/mod.rs create mode 100644 src/context/arch/aarch64.rs create mode 100644 src/context/arch/riscv64.rs create mode 100644 src/context/arch/x86.rs create mode 100644 src/context/arch/x86_64.rs create mode 100644 src/context/context.rs create mode 100644 src/context/file.rs create mode 100644 src/context/memory.rs create mode 100644 src/context/mod.rs 
create mode 100644 src/context/signal.rs create mode 100644 src/context/switch.rs create mode 100644 src/context/timeout.rs create mode 100644 src/cpu_set.rs create mode 100644 src/cpu_stats.rs create mode 100644 src/debugger.rs create mode 100644 src/devices/graphical_debug/debug.rs create mode 100644 src/devices/graphical_debug/mod.rs create mode 100644 src/devices/mod.rs create mode 100644 src/devices/serial.rs create mode 100644 src/devices/uart_16550.rs create mode 100644 src/devices/uart_pl011.rs create mode 100644 src/dtb/irqchip.rs create mode 100644 src/dtb/mod.rs create mode 100644 src/dtb/serial.rs create mode 100644 src/event.rs create mode 100644 src/log.rs create mode 100644 src/macros.rs create mode 100644 src/main.rs create mode 100644 src/memory/kernel_mapper.rs create mode 100644 src/memory/mod.rs create mode 100644 src/memory/page.rs create mode 100644 src/panic.rs create mode 100644 src/percpu.rs create mode 100644 src/profiling.rs create mode 100644 src/ptrace.rs create mode 100644 src/scheme/acpi.rs create mode 100644 src/scheme/debug.rs create mode 100644 src/scheme/dtb.rs create mode 100644 src/scheme/event.rs create mode 100644 src/scheme/irq.rs create mode 100644 src/scheme/memory.rs create mode 100644 src/scheme/mod.rs create mode 100644 src/scheme/pipe.rs create mode 100644 src/scheme/proc.rs create mode 100644 src/scheme/serio.rs create mode 100644 src/scheme/sys/block.rs create mode 100644 src/scheme/sys/context.rs create mode 100644 src/scheme/sys/cpu.rs create mode 100644 src/scheme/sys/exe.rs create mode 100644 src/scheme/sys/fdstat.rs create mode 100644 src/scheme/sys/iostat.rs create mode 100644 src/scheme/sys/irq.rs create mode 100644 src/scheme/sys/log.rs create mode 100644 src/scheme/sys/mod.rs create mode 100644 src/scheme/sys/stat.rs create mode 100644 src/scheme/sys/syscall.rs create mode 100644 src/scheme/sys/uname.rs create mode 100644 src/scheme/time.rs create mode 100644 src/scheme/user.rs create mode 100644 
src/startup/memory.rs create mode 100644 src/startup/mod.rs create mode 100644 src/sync/mod.rs create mode 100644 src/sync/ordered.rs create mode 100644 src/sync/wait_condition.rs create mode 100644 src/sync/wait_queue.rs create mode 100644 src/syscall/debug.rs create mode 100644 src/syscall/fs.rs create mode 100644 src/syscall/futex.rs create mode 100644 src/syscall/mod.rs create mode 100644 src/syscall/process.rs create mode 100644 src/syscall/time.rs create mode 100644 src/syscall/usercopy.rs create mode 100644 src/time.rs create mode 100644 targets/aarch64-unknown-kernel.json create mode 100644 targets/i586-unknown-kernel.json create mode 100644 targets/riscv64-unknown-kernel.json create mode 100644 targets/x86_64-unknown-kernel.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..7774614018 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +target +/config.toml +.gitlab-ci-local/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000..7d82652f70 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,90 @@ +image: "redoxos/redoxer:latest" + +variables: + GIT_SUBMODULE_STRATEGY: recursive + +workflow: + rules: + - if: '$CI_PROJECT_NAMESPACE == "redox-os"' + - if: '$CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "master"' + +stages: + - build + - cross-build + - test + - other-features + # TODO: benchmarks and profiling (maybe manually enabled for relevant MRs)? 
+ +x86_64: + stage: build + script: + - mkdir -p target/${ARCH} + - redoxer env make BUILD=target/${ARCH} + variables: + ARCH: "x86_64" + +aarch64: + stage: cross-build + image: "redoxos/redoxer:aarch64" + script: + - mkdir -p target/${ARCH} + - redoxer env make BUILD=target/${ARCH} + variables: + ARCH: "aarch64" + +i586: + stage: cross-build + script: + - mkdir -p target/${ARCH} + - TARGET=${ARCH}-unknown-redox redoxer env make BUILD=target/${ARCH} + variables: + ARCH: "i586" + +riscv64gc: + stage: cross-build + script: + - mkdir -p target/${ARCH} + - TARGET=${ARCH}-unknown-redox redoxer env make BUILD=target/${ARCH} + variables: + ARCH: "riscv64gc" + +fmt: + stage: build + script: + - rustup component add rustfmt + - rustfmt --check + +x86_64:boot: + stage: test + needs: [x86_64] + script: + - mkdir -p target/${ARCH} + - export COOKBOOK_SOURCE_IDENT=$CI_COMMIT_SHA + - redoxer env make BUILD=target/${ARCH} + - timeout -s KILL 9m redoxer exec --folder target/${ARCH}/:/usr/lib/boot uname -a + variables: + ARCH: "x86_64" + +x86_64:relibc: + stage: test + needs: [x86_64] + script: + - redoxer pkg relibc-tests-bins + - export COOKBOOK_SOURCE_IDENT=$CI_COMMIT_SHA + - mkdir -p target/${TARGET}/sysroot/{usr/lib/boot,root} target/${TARGET}/root + - redoxer env make BUILD=target/${TARGET}/sysroot/usr/lib/boot + - (cd target/${TARGET}/sysroot && mv home/user/relibc-tests/* root/) + - timeout -s KILL 9m redoxer exec --folder target/${TARGET}/sysroot/:/ make run + # It is fine if failing sometimes + allow_failure: true + variables: + TARGET: "x86_64-unknown-redox" + +profiling-compile: + stage: other-features + allow_failure: true + script: + make check + variables: + ARCH: "x86_64" + KERNEL_CHECK_FEATURES: profiling diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..67f90d4b97 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "redox-path"] + path = redox-path + url = https://gitlab.redox-os.org/redox-os/redox-path.git + branch = main diff 
--git a/.helix/config.toml b/.helix/config.toml new file mode 100644 index 0000000000..a1ec3e0a6a --- /dev/null +++ b/.helix/config.toml @@ -0,0 +1,2 @@ +[editor] +auto-format = false diff --git a/.helix/languages.toml b/.helix/languages.toml new file mode 100644 index 0000000000..c86c7b8a5e --- /dev/null +++ b/.helix/languages.toml @@ -0,0 +1,13 @@ +[[language]] +name = "rust" + +[[language-server.rust-analyzer.config.cargo]] +extraEnv = ["RUST_TARGET_PATH=targets"] +# Select one of the targets to make lsp work for your configuration +# Do not commit this change +# TODO: find a better way to do this +# target = "aarch64-unknown-kernel" + +[[language-server.rust-analyzer.config.check]] +targets = ["x86_64-unknown-kernel", "i686-unknown-kernel", "aarch64-unknown-kernel"] + diff --git a/ARM-AARCH64-PORT-OUTLINE.md b/ARM-AARCH64-PORT-OUTLINE.md new file mode 100644 index 0000000000..22925209b9 --- /dev/null +++ b/ARM-AARCH64-PORT-OUTLINE.md @@ -0,0 +1,79 @@ +# Porting the core Redox kernel to arm AArch64: An outline + +## Intro + +This document is [my](https://github.com/raw-bin) attempt at: + +* Capturing thinking on the work needed for a core Redox kernel port +* Sharing progress with the community as things evolve +* Creating a template that can be used for ports to other architectures + +Core Redox kernel means everything needed to get to a non-graphical console-only multi-user shell. + +Only the 64-bit execution state (AArch64) with the 64-bit instruction set architecture (A64) shall be supported for the moment. For more background/context read [this](https://developer.arm.com/products/architecture/a-profile/docs/den0024/latest/introduction). + +This document is intended to be kept *live*. It will be updated to reflect the current state of work and any feedback received. + +It is hard, if not futile, to come up with a strict sequence of work for such ports, but this document is a reasonable template to follow.
+ +## Intended target platform + +The primary focus is on [qemu's virt machine platform emulation for the AArch64 architecture](https://github.com/qemu/qemu/blob/master/hw/arm/virt.c#L127). + +Targeting a virtual platform is a convenient way to bring up the mechanics of architectural support and makes the jump to silicon easier. The preferred boot chain for AArch64 (explained later) is well supported on this platform and boot-over-tftp from localhost makes the debug cycle very efficient. + +Once the core kernel port is complete a similar follow on document will be created that is dedicated to silicon bring-up. + +## Boot protocol elements + +| Item | Notes | +|------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Linux kernel boot protocol for AArch64](https://www.kernel.org/doc/Documentation/arm64/booting.txt) | The linked document describes assumptions made from the bootloader which are field tested and worthwhile to have for Redox an AArch64.
The intent is to consider most of the document except anything tied to the Linux kernel itself. | +| [Flattened Device Tree](https://elinux.org/Device_Tree_Reference) | FDT binary blobs supplied by the bootloader shall provide the Redox kernel with misc platform \{memory, interrupt, devicemem} maps. Qemu's virt machine platform synthetically creates an FDT blob at a specific address which is very handy. | + +## Boot flow elements + +The following table lists the boot flow in order. + +| Item | Notes | +|-------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [ARM Trusted Firmware (TF-A)](https://github.com/ARM-software/arm-trusted-firmware) | TF-A is a de-facto standard reference firmware implementation and proven in the field.
TF-A runs post power-on on Armv8-A implementations and eventually hands off to further stages of the boot flow.
For qemu's virt machine platform, it is essentially absent, but I mean to rely on it heavily for silicon bring-up, hence the mention here. | +| [u-boot](https://www.denx.de/wiki/U-Boot) | u-boot will handle early console access, media access for fetching redox kernel images from non-volatile storage/misc disk subsystems/off the network.
u-boot supports loading EFI applications. If EFI support is added to AArch64 Redox in the future, that should essentially work out of the box.
u-boot will load redox and FDT binary blobs into RAM and jump to the redox kernel. | +| Redox early-init stub | For AArch64, the redox kernel will contain an A64 assembly stub that will setup the MMU from scratch. This is akin to the [x86_64 redox bootloader](https://github.com/redox-os/bootloader/blob/master/x86_64/startup-x86_64.asm).
This stub sets up identity maps for MMU initialization, maps the kernel image itself as well as the device memory for the UART console. At present this stub shall be a part of the kernel itself for simplicity. | +| Redox kstart entry | The early init stub hands off here. kstart will then re-init the MMU more comprehensively. | + +## Supported devices + +The following devices shall be supported. All necessary information specific to these devices will be provided to the redox kernel by the platform specific FDT binary blob. + +| Device | Notes | +|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Generic Interrupt Controller v2](https://developer.arm.com/products/architecture/a-profile/docs/ihi0048/b/arm-generic-interrupt-controller-architecture-version-20-architecture-specification) | The GIC is an Arm-v8A architectural element and is supported by all architecturally compliant processor implementations. GICv2 is supported by qemu's virt machine emulation and most subsequent GIC implementations are backward compatible to GICv2. | +| [Generic Timer](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0500d/BGBBIJCB.html) | The Generic Timer Architecture is an Arm-v8A architectural element and is implemented by all compliant processor implementations. It is supported by qemu. | +| [PrimeCell UART PL011](http://infocenter.arm.com/help/topic/com.arm.doc.ddi0183f/DDI0183.pdf) | The PL011 UART is supported by qemu and most ARM systems. 
| + +## Intended development sequence and status + +| Item | Description | Status | Notes | +|--------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------|-------------------------------------------------------------------------------| +| Redox AArch64 toolchain | Create an usable redox AArch64 toolchain specification | Done | Using this JSON spec in isolated tests produces valid AArch64 soft float code | +| Stubbed kernel image | Stub out AArch64 kernel support using the existing x86_64 arch code as a template
Modify redox kernel build glue and work iteratively to get a linkable (non-functional) image | Not done yet | | +| Boot flow | Create a self hosted u-boot -> redox kernel workflow
Should obtain the stubbed image from a local TFTP server, load it into RAM and jump to it | Not done yet | | +| GDB Debug flow | Create a debug workflow centered around qemu's GDB stub
This should allow connecting to qemu's GDB stub, debugging the u-boot/redox stub via a GDB client and single-stepping through code | Not done yet | | +| Verify Redox entry | Verify that control reaches the redox kernel from u-boot | Not done yet | |
Verify that this code is located appropriately in the link map and that control reaches this code from u-boot | Not done yet | | +| Basic DTB support | Integrate the [device_tree crate](https://mbr.github.io/device_tree-rs/device_tree/)
Use the crate to access the qemu supplied DTB image and extract the memory map | Not done yet | | +| Basic UART support | Use the device_tree crate to get the UART address from the DTB image and set up the initial console
This is a polling mode only setup | Not done yet | | +| Initial MMU support | Implement initial MMU support in the early init stub
This forces the MMU into a clean state overriding any bootloader specific setup
Create an identity map for MMU init
Create a mapping for the kernel image
Create a mapping for any devices needed at this stage (UART) | Not done yet | | +| kmain entry | Verify that kmain entry works post early MMU init | Not done yet | | +| Basic Redox MMU support | Get Redox to create a final set of mappings for everything
Verify that this works as expected | Not done yet | | +| Basic libc support | Flesh out a basic set of libc calls as required for simple user-land apps | Not done yet | | +| userspace_init entry | Verify user-space entry and /sbin/init invocation | Not done yet | | +| Basic Interrupt controller support | Add a GIC driver
Verify functionality | Not done yet | | +| Basic Timer support | Add a Generic Timer driver
Verify functionality | Not done yet | | +| UART interrupt support | Add support for UART interrupts | Not done yet | | +| Task context switch support | Add context switching support
Verify functionality | Not done yet | | +| Login shell | Iteratively add and verify multi-user login shell support | Not done yet | | +| Publish development branch on github | Work with the community to post work done after employer approval | Not done yet | | +| Break out the Bubbly | Drink copious quantities of alcohol to celebrate | Not done yet | | +| Silicon bring-up | Plan silicon bring-up | Not done yet | | diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000000..44f84c4740 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,421 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "bit_field" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "cc" +version = "1.2.60" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "fdt" +version = "0.2.0-alpha1" +source = "git+https://github.com/repnop/fdt.git?rev=2fb1409edd1877c714a0aa36b6a7c5351004be54#2fb1409edd1877c714a0aa36b6a7c5351004be54" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.0", +] + +[[package]] +name = "kernel" +version = "0.5.12" +dependencies = [ + "arrayvec", + "bitfield", + "bitflags 2.11.1", + "cc", + "fdt", + "hashbrown 0.14.5", + "linked_list_allocator", + "object", + "raw-cpuid", + "redox-path", + "redox_syscall", + "rmm", + "rustc-demangle", + "sbi-rt", + "slab", + "smallvec", + "spin", + 
"toml", + "x86", +] + +[[package]] +name = "linked_list_allocator" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "549ce1740e46b291953c4340adcd74c59bcf4308f4cac050fd33ba91b7168f4a" +dependencies = [ + "spinning_top", +] + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "raw-cpuid" +version = "10.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox-path" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"64072665120942deff5fd5425d6c1811b854f4939e7f1c01ce755f64432bbea7" + +[[package]] +name = "redox_syscall" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" +dependencies = [ + "bitflags 2.11.1", +] + +[[package]] +name = "rmm" +version = "0.1.0" +dependencies = [ + "bitflags 2.11.1", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "sbi-rt" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fbaa69be1eedc61c426e6d489b2260482e928b465360576900d52d496a58bd0" +dependencies = [ + "sbi-spec", +] + +[[package]] +name = "sbi-spec" +version = "0.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e36312fb5ddc10d08ecdc65187402baba4ac34585cb9d1b78522ae2358d890" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = 
"serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spinning_top" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b9eb1a2f4c41445a3a0ff9abc5221c5fcd28e1f13cd7c0397706f9ac938ddb0" +dependencies = [ + "lock_api", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "x86" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55b5be8cc34d017d8aabec95bc45a43d0f20e8b2a31a453cabc804fe996f8dca" +dependencies = [ + "bit_field", + "bitflags 1.3.2", + "raw-cpuid", +] + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + 
"quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..6d4f059ace --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,120 @@ +[workspace] +resolver = "3" + +[package] +name = "kernel" +version = "0.5.12" +build = "build.rs" +edition = "2024" + +[build-dependencies] +cc = "1.0" +toml = "0.8" + +[dependencies] +arrayvec = { version = "0.7.4", default-features = false } +bitfield = "0.13.2" +bitflags = "2" +fdt = { git = "https://github.com/repnop/fdt.git", rev = "2fb1409edd1877c714a0aa36b6a7c5351004be54" } +hashbrown = { version = "0.14.3", default-features = false, features = ["ahash", "inline-more"] } +linked_list_allocator = "0.9.0" +redox-path = "0.2.0" +redox_syscall = { version = "0.7.4", default-features = false } +rmm = { path = "rmm", default-features = false } +slab = { version = "0.4", default-features = false } +smallvec = { version = "1.15.1", default-features = false } +spin = { version = "0.9.8" } + +[dependencies.object] +version = "0.37.1" +default-features = false +features = ["read_core", "elf"] + +[dependencies.rustc-demangle] +version = "0.1.16" +default-features = false + +[lints.clippy] +# Overflows are very, very bad in kernel code as it may provide an attack vector for +# userspace applications, and it is only checked in debug builds +# TODO: address occurrences and then deny +arithmetic_side_effects = "warn" +cast_ptr_alignment = "warn" # TODO: address occurrences and then deny +identity_op = "allow" # Used to allow stuff like 1 << 0 and 1 * 1024 * 1024 +if_same_then_else = "allow" # Useful for adding comments about different branches +# Indexing a slice can cause panics and that is something we always want to avoid +# in kernel code. 
Use .get and return an error instead +# TODO: address occurrences and then deny +indexing_slicing = "warn" +many_single_char_names = "allow" # Useful in the syscall function +module_inception = "allow" # Used for context::context +# Not implementing default is sometimes useful in the case something has significant cost +# to allocate. If you implement default, it can be allocated without evidence using the +# ..Default::default() syntax. Not fun in kernel space +new_without_default = "allow" +not_unsafe_ptr_arg_deref = "deny" +or_fun_call = "allow" # Used to make it nicer to return errors, for example, .ok_or(Error::new(ESRCH)) +precedence = "deny" +ptr_cast_constness = "deny" +too_many_arguments = "allow" # This is needed in some cases, like for syscall +# Avoid panicking in the kernel without information about the panic. Use expect +# TODO: address occurrences and then deny +unwrap_used = "warn" + +[lints.rust] +static_mut_refs = "warn" # FIXME deny once all occurrences are fixed +# This is usually a serious issue - a missing import of a define where it is interpreted +# as a catch-all variable in a match, for example +unreachable_patterns = "deny" +unused_must_use = "deny" # Ensure that all must_use results are used + +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] +raw-cpuid = "10.2.0" +x86 = { version = "0.47.0", default-features = false } + +[target.'cfg(any(target_arch = "riscv64", target_arch = "riscv32"))'.dependencies] +sbi-rt = "0.0.3" + +[features] +default = [ + "acpi", + #"debugger", + "multi_core", + "serial_debug", + "self_modifying", + "x86_kvm_pv", + #"busy_panic", + #"drop_panic", + #"syscall_debug" +] + +# Activates some limited code-overwriting optimizations, based on CPU features. 
+self_modifying = [] + +acpi = [] +lpss_debug = [] +multi_core = ["acpi"] +profiling = [] +#TODO: remove when threading issues are fixed +pti = [] +drop_panic = [] +busy_panic = [] +qemu_debug = [] +serial_debug = [] +system76_ec_debug = [] +x86_kvm_pv = [] + +debugger = ["syscall_debug"] +syscall_debug = [] + +sys_fdstat = [] + +[profile.dev] +# Avoids having to define the eh_personality lang item and reduces kernel size +panic = "abort" + +[profile.release] +# Avoids having to define the eh_personality lang item and reduces kernel size +panic = "abort" +#lto = true +debug = "full" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..eeb7504cd1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Jeremy Soller + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..68a8c50ae5 --- /dev/null +++ b/Makefile @@ -0,0 +1,66 @@ +.PHONY: all check + +SOURCE:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) +BUILD?=$(CURDIR) +export RUST_TARGET_PATH=$(SOURCE)/targets + +ifeq ($(TARGET),) + ARCH?=$(shell uname -m) +else + ARCH?=$(shell echo "$(TARGET)" | cut -d - -f1) +endif + +ifeq ($(ARCH),riscv64gc) + override ARCH:=riscv64 + GNU_TARGET=riscv64-unknown-redox +else ifeq ($(ARCH),i686) + override ARCH:=i586 + GNU_TARGET=i686-unknown-redox +else + GNU_TARGET=$(ARCH)-unknown-redox +endif + + +all: $(BUILD)/kernel $(BUILD)/kernel.sym + +LD_SCRIPT=$(SOURCE)/linkers/$(ARCH).ld +LOCKFILE=$(SOURCE)/Cargo.lock +MANIFEST=$(SOURCE)/Cargo.toml +TARGET_SPEC=$(RUST_TARGET_PATH)/$(ARCH)-unknown-kernel.json + +KERNEL_CARGO_FEATURES?= + +$(BUILD)/kernel.all: $(LD_SCRIPT) $(LOCKFILE) $(MANIFEST) $(TARGET_SPEC) $(shell find $(SOURCE) -name "*.rs" -type f) + cargo rustc \ + --bin kernel \ + --manifest-path "$(MANIFEST)" \ + --target "$(TARGET_SPEC)" \ + --release \ + -Z build-std=core,alloc -Zbuild-std-features=compiler-builtins-mem \ + --features=$(KERNEL_CARGO_FEATURES) \ + -- \ + -C link-arg=-T -Clink-arg="$(LD_SCRIPT)" \ + -C link-arg=-z -Clink-arg=max-page-size=0x1000 \ + --emit link="$(BUILD)/kernel.all" + +$(BUILD)/kernel.sym: $(BUILD)/kernel.all + $(GNU_TARGET)-objcopy \ + --only-keep-debug \ + "$(BUILD)/kernel.all" \ + "$(BUILD)/kernel.sym" + +$(BUILD)/kernel: $(BUILD)/kernel.all + $(GNU_TARGET)-objcopy \ + --strip-debug \ + "$(BUILD)/kernel.all" \ + "$(BUILD)/kernel" + +KERNEL_CHECK_FEATURES?= + +check: + cargo check \ + --bin kernel \ + --manifest-path "$(MANIFEST)" \ + --target "$(TARGET_SPEC)" \ + -Z build-std=core,alloc -Zbuild-std-features=compiler-builtins-mem \ + --features=$(KERNEL_CHECK_FEATURES) diff --git a/README.md b/README.md new file mode 100644 index 0000000000..cf54d099e5 --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +# Kernel + +Redox OS Microkernel + 
+[![docs](https://img.shields.io/badge/docs-master-blue.svg)](https://docs.rs/redox_syscall/latest/syscall/) +[![SLOCs counter](https://tokei.rs/b1/github/redox-os/kernel?category=code)](https://github.com/XAMPPRocky/tokei) +[![MIT licensed](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE) + +## Requirements + +* [`nasm`](https://nasm.us/) needs to be available on the PATH at build time. + +## Building The Documentation + +Use this command: + +```sh +cargo doc --open --target x86_64-unknown-none +``` + +## Debugging + +### QEMU + +Running [QEMU](https://www.qemu.org) with the `-s` flag will set up QEMU to listen on port `1234` for a GDB client to connect to it. To debug the Redox kernel, run: + +```sh +make qemu gdb=yes +``` + +This will start a virtual machine and listen on port `1234` for a GDB or LLDB client. + +### GDB + +If you are going to use [GDB](https://www.gnu.org/software/gdb/), run these commands to load debug symbols and connect to your running kernel: + +``` +(gdb) symbol-file build/kernel.sym +(gdb) target remote localhost:1234 +``` + +### LLDB + +If you are going to use [LLDB](https://lldb.llvm.org/), run these commands to start debugging: + +``` +(lldb) target create -s build/kernel.sym build/kernel +(lldb) gdb-remote localhost:1234 +``` + +After connecting to your kernel you can set some interesting breakpoints and `continue` +the process. See your debugger's man page for more information on useful commands to run. + +## Notes + +- Always use `foo.get(n)` instead of `foo[n]` and try to cover for the possibility of `Option::None`. Doing it the regular way may work fine for applications, but never in the kernel. No possible panics should ever exist in kernel space, because then the whole OS would just stop working. + +- If you receive a kernel panic in QEMU, use `pkill qemu-system` to kill the frozen QEMU process.
+ +## How To Contribute + +To learn how to contribute to this system component you need to read the following document: + +- [CONTRIBUTING.md](https://gitlab.redox-os.org/redox-os/redox/-/blob/master/CONTRIBUTING.md) + +## Development + +To learn how to do development with this system component inside the Redox build system you need to read the [Build System](https://doc.redox-os.org/book/build-system-reference.html) and [Coding and Building](https://doc.redox-os.org/book/coding-and-building.html) pages. + +### How To Build + +To build this system component you need to download the Redox build system, you can learn how to do it on the [Building Redox](https://doc.redox-os.org/book/podman-build.html) page. + +This is necessary because they only work with cross-compilation to a Redox virtual machine, but you can do some testing from Linux. + +## Funding - _Unix-style Signals and Process Management_ + +This project is funded through [NGI Zero Core](https://nlnet.nl/core), a fund established by [NLnet](https://nlnet.nl) with financial support from the European Commission's [Next Generation Internet](https://ngi.eu) program. Learn more at the [NLnet project page](https://nlnet.nl/project/RedoxOS-Signals). 
+ +[NLnet foundation logo](https://nlnet.nl) +[NGI Zero Logo](https://nlnet.nl/core) diff --git a/build.rs b/build.rs new file mode 100644 index 0000000000..96c3ea5c78 --- /dev/null +++ b/build.rs @@ -0,0 +1,100 @@ +#![allow(clippy::unwrap_used)] // the build script can panic + +use std::{env, path::Path, process::Command}; +use toml::Table; + +fn parse_kconfig(arch: &str) -> Option<()> { + println!("cargo:rerun-if-changed=config.toml"); + + assert!(Path::new("config.toml.example").try_exists().unwrap()); + if !Path::new("config.toml").try_exists().unwrap() { + std::fs::copy("config.toml.example", "config.toml").unwrap(); + } + let config_str = std::fs::read_to_string("config.toml").unwrap(); + let root: Table = toml::from_str(&config_str).unwrap(); + + let altfeatures = root + .get("arch")? + .as_table() + .unwrap() + .get(arch)? + .as_table() + .unwrap() + .get("features")? + .as_table() + .unwrap(); + + #[expect(clippy::format_collect)] // TODO: remove once version is bumped + let features_list = altfeatures + .keys() + .map(|feat| format!(", {feat:?}")) + .collect::(); + println!("cargo::rustc-check-cfg=cfg(cpu_feature_always, values(\"\"{features_list}))"); + println!("cargo::rustc-check-cfg=cfg(cpu_feature_auto, values(\"\"{features_list}))"); + println!("cargo::rustc-check-cfg=cfg(cpu_feature_never, values(\"\"{features_list}))"); + + let self_modifying = env::var("CARGO_FEATURE_SELF_MODIFYING").is_ok(); + + for (name, value) in altfeatures { + let mut choice = value.as_str().unwrap(); + assert!(matches!(choice, "always" | "never" | "auto")); + + if !self_modifying && choice == "auto" { + choice = "never"; + } + + println!("cargo:rustc-cfg=cpu_feature_{choice}=\"{name}\""); + } + + Some(()) +} + +fn main() { + println!("cargo::rustc-env=TARGET={}", env::var("TARGET").unwrap()); + println!("cargo::rustc-check-cfg=cfg(dtb)"); + + let out_dir = env::var("OUT_DIR").unwrap(); + let arch_str = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + + match &*arch_str { + 
"aarch64" => { + println!("cargo::rustc-cfg=dtb"); + } + "x86" => { + println!("cargo::rerun-if-changed=src/asm/x86/trampoline.asm"); + + let status = Command::new("nasm") + .arg("-f") + .arg("bin") + .arg("-o") + .arg(format!("{}/trampoline", out_dir)) + .arg("src/asm/x86/trampoline.asm") + .status() + .expect("failed to run nasm"); + if !status.success() { + panic!("nasm failed with exit status {}", status); + } + } + "x86_64" => { + println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm"); + + let status = Command::new("nasm") + .arg("-f") + .arg("bin") + .arg("-o") + .arg(format!("{}/trampoline", out_dir)) + .arg("src/asm/x86_64/trampoline.asm") + .status() + .expect("failed to run nasm"); + if !status.success() { + panic!("nasm failed with exit status {}", status); + } + } + "riscv64" => { + println!("cargo::rustc-cfg=dtb"); + } + _ => (), + } + + let _ = parse_kconfig(&arch_str); +} diff --git a/clippy.sh b/clippy.sh new file mode 100755 index 0000000000..9528dfcf27 --- /dev/null +++ b/clippy.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +set -e + +export RUST_TARGET_PATH="${PWD}/targets" +export RUSTFLAGS="-C debuginfo=2" +cargo clippy --lib --release --target x86_64-unknown-none "$@" diff --git a/config.toml.example b/config.toml.example new file mode 100644 index 0000000000..b3cdb9e7fb --- /dev/null +++ b/config.toml.example @@ -0,0 +1,7 @@ +[arch.x86_64.features] +smap = "auto" +fsgsbase = "auto" +xsave = "auto" +xsaveopt = "auto" + +# vim: ft=toml diff --git a/linkers/aarch64.ld b/linkers/aarch64.ld new file mode 100644 index 0000000000..c64181cb24 --- /dev/null +++ b/linkers/aarch64.ld @@ -0,0 +1,55 @@ +ENTRY(kstart) +OUTPUT_FORMAT("elf64-littleaarch64", "elf64-littleaarch64", "elf64-littleaarch64") + +KERNEL_OFFSET = 0xFFFFFF0000000000; + +SECTIONS { + . = KERNEL_OFFSET; + + . += SIZEOF_HEADERS; + + /* Force the zero page to be part of a segment by creating a + * dummy section in the zero page. 
+ * Limine will map the segment with the lowest vaddr value at + * 0xFFFFFFFF80000000 even if the segment has a higher vaddr. + * As such without the zero page being part of a segment, the + * kernel would be loaded at an offset from the expected + * location. As the redox kernel is not currently relocatable, + * this would result in a crash. A similar issue likely exists + * with multiboot/multiboot2 and the paddr of the segment. + */ + .dummy ALIGN(8) : AT(ADDR(.dummy) - KERNEL_OFFSET) {} + + . = ALIGN(4096); + + .text : AT(ADDR(.text) - KERNEL_OFFSET) { + __text_start = .; + *(.text*) + . = ALIGN(4096); + __text_end = .; + } + + .rodata : AT(ADDR(.rodata) - KERNEL_OFFSET) { + __rodata_start = .; + *(.rodata*) + . = ALIGN(4096); + __rodata_end = .; + } + + .data : AT(ADDR(.data) - KERNEL_OFFSET) { + *(.data*) + . = ALIGN(4096); + *(.bss*) + . = ALIGN(4096); + } + + __end = .; + + /DISCARD/ : { + *(.comment*) + *(.eh_frame*) + *(.gcc_except_table*) + *(.note*) + *(.rel.eh_frame*) + } +} diff --git a/linkers/i586.ld b/linkers/i586.ld new file mode 100644 index 0000000000..69ae7ee2e4 --- /dev/null +++ b/linkers/i586.ld @@ -0,0 +1,51 @@ +ENTRY(kstart) +OUTPUT_FORMAT(elf32-i386) + +KERNEL_OFFSET = 0xC0000000; + +SECTIONS { + . = KERNEL_OFFSET; + + . += SIZEOF_HEADERS; + + /* Force the zero page to be part of a segment by creating a + * dummy section in the zero page. + * Limine will map the segment with the lowest vaddr value at + * 0xFFFFFFFF80000000 even if the segment has a higher vaddr. + * As such without the zero page being part of a segment, the + * kernel would be loaded at an offset from the expected + * location. As the redox kernel is not currently relocatable, + * this would result in a crash. A similar issue likely exists + * with multiboot/multiboot2 and the paddr of the segment. 
+ */ + .dummy : AT(ADDR(.dummy) - KERNEL_OFFSET) {} + + .text ALIGN(4K) : AT(ADDR(.text) - KERNEL_OFFSET) { + __text_start = .; + *(.text*) + } + + .rodata ALIGN(4K) : AT(ADDR(.rodata) - KERNEL_OFFSET) { + __text_end = .; + __rodata_start = .; + *(.rodata*) + } + + .data ALIGN(4K) : AT(ADDR(.data) - KERNEL_OFFSET) { + __rodata_end = .; + *(.data*) + . = ALIGN(4K); + *(.bss*) + . = ALIGN(4K); + } + + __end = .; + + /DISCARD/ : { + *(.comment*) + *(.eh_frame*) + *(.gcc_except_table*) + *(.note*) + *(.rel.eh_frame*) + } +} diff --git a/linkers/riscv64.ld b/linkers/riscv64.ld new file mode 100644 index 0000000000..7935190f2b --- /dev/null +++ b/linkers/riscv64.ld @@ -0,0 +1,61 @@ +ENTRY(kstart) +OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv", "elf64-littleriscv" ) + +KERNEL_OFFSET = 0xFFFFFFFF80000000; + +SECTIONS { + . = KERNEL_OFFSET; + + . += SIZEOF_HEADERS; + + /* Force the zero page to be part of a segment by creating a + * dummy section in the zero page. + * Linker will map the segment with the lowest vaddr value at + * 0xFFFFFFFF80000000 even if the segment has a higher vaddr. + * As such without the zero page being part of a segment, the + * kernel would be loaded at an offset from the expected + * location. As the redox kernel is not currently relocatable, + * this would result in a crash. A similar issue likely exists + * with multiboot/multiboot2 and the paddr of the segment. + */ + .dummy ALIGN(8) : AT(ADDR(.dummy) - KERNEL_OFFSET) {} + + . = ALIGN(4096); + + .text : AT(ADDR(.text) - KERNEL_OFFSET) { + __text_start = .; + *(.early_init.text*) + . = ALIGN(4096); + *(.text*) + . = ALIGN(4096); + __text_end = .; + } + + .rodata : AT(ADDR(.rodata) - KERNEL_OFFSET) { + __rodata_start = .; + *(.rodata*) + . = ALIGN(4096); + __rodata_end = .; + } + + .data : AT(ADDR(.data) - KERNEL_OFFSET) { + *(.data*) + *(.sdata*) + . = ALIGN(4096); + *(.got*) + . = ALIGN(4096); + *(.bss*) + *(.sbss*) + . 
= ALIGN(4096); + } + + __end = .; + + /DISCARD/ : { + *(.comment*) + *(.eh_frame*) + *(.gcc_except_table*) + *(.note*) + *(.rel.eh_frame*) + } +} diff --git a/linkers/x86_64.ld b/linkers/x86_64.ld new file mode 100644 index 0000000000..570b08fba6 --- /dev/null +++ b/linkers/x86_64.ld @@ -0,0 +1,60 @@ +ENTRY(kstart) +OUTPUT_FORMAT(elf64-x86-64) + +KERNEL_OFFSET = 0xFFFFFFFF80000000; + +SECTIONS { + . = KERNEL_OFFSET; + + . += SIZEOF_HEADERS; + + /* Force the zero page to be part of a segment by creating a + * dummy section in the zero page. + * Limine will map the segment with the lowest vaddr value at + * 0xFFFFFFFF80000000 even if the segment has a higher vaddr. + * As such without the zero page being part of a segment, the + * kernel would be loaded at an offset from the expected + * location. As the redox kernel is not currently relocatable, + * this would result in a crash. A similar issue likely exists + * with multiboot/multiboot2 and the paddr of the segment. + */ + .dummy : AT(ADDR(.dummy) - KERNEL_OFFSET) {} + + .text ALIGN(4K) : AT(ADDR(.text) - KERNEL_OFFSET) { + __text_start = .; + *(.text*) + } + + .rodata ALIGN(4K) : AT(ADDR(.rodata) - KERNEL_OFFSET) { + __text_end = .; + __rodata_start = .; + *(.rodata*) + __altcode_start = .; + KEEP(*(.altcode*)) + __altcode_end = .; + . = ALIGN(8); + __altrelocs_start = .; + KEEP(*(.altrelocs*)) + __altrelocs_end = .; + __altfeatures_start = .; + KEEP(*(.altfeatures*)) + __altfeatures_end = .; + } + + .data ALIGN(4K) : AT(ADDR(.data) - KERNEL_OFFSET) { + __rodata_end = .; + *(.data*) + . 
= ALIGN(4K); + *(.bss*) + } + + __end = .; + + /DISCARD/ : { + *(.comment*) + *(.eh_frame*) + *(.gcc_except_table*) + *(.note*) + *(.rel.eh_frame*) + } +} diff --git a/res/unifont.font b/res/unifont.font new file mode 100644 index 0000000000000000000000000000000000000000..a00366d5f559bc6fb3c20b9ee9cf8320219013b5 GIT binary patch literal 4096 zcmZuz-D?zA6rWC)GAY|-7*o5IaUF+AS@X1;NO9vbom?i{*;UgxN~tkY-9jjRa+6Zb z2b(-pFa_}=Pqs*7>lPIHRtkA4F21-RBEIFLxpRb{dDYWjgR*p1=Wu=UIDyB@X|m1WVxyNTF$eHwfO{ z@?4@{JJ_>F4r;!$F(|f%2);AO{STA;J68w!)`wcYxw&)pyK@>JJ=6cW`jN&f+fcun z#^WQ8|N8t-jl+$tm*E4ACoVugE@-^;tZy&f(zw*Qxn8=YaRKmtg5A#Tb(dfVuye_H z%do^SEDK2=#+yQp<8*_dnZ zG8htfA%Gl&kktX~GyG*dzK5WnJ0|==W*Au(e)g)_?W`)e4k-}*95(`m0H_+k(+*2RDXUWGUOu~nrH8AdAJD>0Ic#a9H+^G698fx;2d zpJuEvAu#H4chX-ds&84L;15ByV8pSk2aA)Lix+2-SjwX-KBPa!=I3W;(W6# z>Dk%&`OF?;K~UsH7y2j;!Y~BJ`GVb(^(+Wq_$+XN2>J_4CTE%)UKXnfpO@hS%x+=k zEzdPvZF*TLi8?bw)gP$LTpMF<&t(F2hUJp^!AzZKAfNRU79TWJ@p%YjRm3Fq&50PA z>46khNZFV5fV5$1;}HFaWtA5*AeU5xJWc@Be<`fwLF9*~7}B31;?L^MiWQ+nu% z2_WV-)Dt#3=o@UC;E`8KJ-}iG30&_Wf%U33sPhC zsi%)`#^n57I5= z;grV5?>_SAeW>=L@32Ff_c#8&HcHO#=X&>6^Izf?%(ATwQm1_wM!Xi1q23CBI~d ztw72;P7owP;5bwlc1c*RqFxi<^v{kz`6*dwqC}l-_j=n8dcvMqM46^tF6%pk{0QOl z+Xd{7FkIhsJx}{bZ2GR0HDH}(ulk>_Up9X(T8D4pGK`7$aftmI<@8>FC>yw5wLk50=5a6Ej#kEaFSc7o!!gy;ROK#WlT3hW^Cyw5`f z6c?dC6px*J6J!4QjvR6k|kqfB6^6f%g~8kCFQV#s~3=0rf)r6r+8LQJ-S;N3p1Y z0sFl=mf=-@%o7fqL!aReBC`jcPXOqD=3n0zWQC#0m_N=gUO|Or zMHR*l(fb1GQ;hl)qdvuB`C%WUg~9z{e)PV8_GJe-FXL14GCqYxeGm$JGG2vwPB4if wimV)FT}*&_1Ygh}*OvDw^hYuJqZsW|tmM`G!Sm{{hXmz{=Xqks"] +edition = "2024" + +[dependencies] +bitflags = "2" + +[features] +std = [] + +[[bin]] +name = "rmm" +path = "src/main.rs" +required-features = ["std"] diff --git a/rmm/README.md b/rmm/README.md new file mode 100644 index 0000000000..f475fba520 --- /dev/null +++ b/rmm/README.md @@ -0,0 +1,4 @@ +# Redox Memory Management + +This is a Rust crate to provide abstractions for hardware memory management. 
It +also contains a mechanism for testing memory management with software emulation. \ No newline at end of file diff --git a/rmm/src/allocator/frame/buddy.rs b/rmm/src/allocator/frame/buddy.rs new file mode 100644 index 0000000000..bd00c44f3a --- /dev/null +++ b/rmm/src/allocator/frame/buddy.rs @@ -0,0 +1,296 @@ +use core::{marker::PhantomData, mem}; + +use crate::{ + Arch, BumpAllocator, FrameAllocator, FrameCount, FrameUsage, PhysicalAddress, VirtualAddress, +}; + +#[repr(transparent)] +struct BuddyUsage(u8); + +#[repr(C, packed)] +struct BuddyEntry { + base: PhysicalAddress, + size: usize, + // Number of first free page + skip: usize, + // Count of used pages + used: usize, + phantom: PhantomData, +} + +impl Clone for BuddyEntry { + fn clone(&self) -> Self { + *self + } +} +impl Copy for BuddyEntry {} + +impl BuddyEntry { + fn empty() -> Self { + Self { + base: PhysicalAddress::new(0), + size: 0, + skip: 0, + used: 0, + phantom: PhantomData, + } + } + + #[inline(always)] + fn pages(&self) -> usize { + self.size >> A::PAGE_SHIFT + } + + fn usage_pages(&self) -> usize { + let bytes = self.pages() * mem::size_of::(); + // Round bytes used for usage to next page + (bytes + A::PAGE_OFFSET_MASK) >> A::PAGE_SHIFT + } + + unsafe fn usage_addr(&self, page: usize) -> Option { + if page < self.pages() { + let phys = self.base.add(page * mem::size_of::()); + Some(A::phys_to_virt(phys)) + } else { + None + } + } + + unsafe fn usage(&self, page: usize) -> Option { + unsafe { + let addr = self.usage_addr(page)?; + Some(A::read(addr)) + } + } + + #[expect(clippy::unit_arg)] + unsafe fn set_usage(&self, page: usize, usage: BuddyUsage) -> Option<()> { + unsafe { + let addr = self.usage_addr(page)?; + Some(A::write(addr, usage)) + } + } +} + +pub struct BuddyAllocator { + table_virt: VirtualAddress, + phantom: PhantomData, +} + +impl BuddyAllocator { + const BUDDY_ENTRIES: usize = A::PAGE_SIZE / mem::size_of::>(); + + pub unsafe fn new(mut bump_allocator: BumpAllocator) -> Option 
{ + unsafe { + // Allocate buddy table + let table_phys = bump_allocator.allocate_one()?; + let table_virt = A::phys_to_virt(table_phys); + for i in 0..(A::PAGE_SIZE / mem::size_of::>()) { + let virt = table_virt.add(i * mem::size_of::>()); + A::write(virt, BuddyEntry::::empty()); + } + + let allocator = Self { + table_virt, + phantom: PhantomData, + }; + + // Add areas to buddy table, combining areas when possible, and skipping frames used + // by the bump allocator + let mut offset = bump_allocator.offset(); + for old_area in bump_allocator.areas().iter() { + let mut area = *old_area; + if offset >= area.size { + offset -= area.size; + continue; + } else if offset > 0 { + area.base = area.base.add(offset); + area.size -= offset; + offset = 0; + } + for i in 0..(A::PAGE_SIZE / mem::size_of::>()) { + let virt = table_virt.add(i * mem::size_of::>()); + let mut entry = A::read::>(virt); + let inserted = if area.base.add(area.size) == { entry.base } { + // Combine entry at start + entry.base = area.base; + entry.size += area.size; + true + } else if area.base == entry.base.add(entry.size) { + // Combine entry at end + entry.size += area.size; + true + } else if entry.size == 0 { + // Create new entry + entry.base = area.base; + entry.size = area.size; + true + } else { + false + }; + if inserted { + A::write(virt, entry); + break; + } + } + } + + //TODO: sort areas? 
+ + // Allocate buddy maps + for i in 0..Self::BUDDY_ENTRIES { + let virt = table_virt.add(i * mem::size_of::>()); + let mut entry = A::read::>(virt); + + // Only set up entries that have enough space for their own usage map + let usage_pages = entry.usage_pages(); + if entry.pages() > usage_pages { + // Mark all usage bytes as unused + let usage_start = entry.usage_addr(0)?; + for page in 0..usage_pages { + A::write_bytes(usage_start.add(page << A::PAGE_SHIFT), 0, A::PAGE_SIZE); + } + + // Mark bytes used for usage as used + for page in 0..usage_pages { + entry.set_usage(page, BuddyUsage(1))?; + } + } + + // Skip the pages used for usage + entry.skip = usage_pages; + + // Set used pages to pages used for usage + entry.used = usage_pages; + + // Write updated entry + A::write(virt, entry); + } + + Some(allocator) + } + } +} + +unsafe impl FrameAllocator for BuddyAllocator { + fn allocate(&mut self, count: FrameCount) -> Option { + unsafe { + if self.table_virt.data() == 0 { + return None; + } + + for entry_i in 0..Self::BUDDY_ENTRIES { + let virt = self + .table_virt + .add(entry_i * mem::size_of::>()); + let mut entry = A::read::>(virt); + + let mut free_page = entry.skip; + let mut free_count = 0; + for page in entry.skip..entry.pages() { + let usage = entry.usage(page)?; + if usage.0 == 0 { + free_count += 1; + + if free_count == count.data() { + break; + } + } else { + free_page = page + 1; + free_count = 0; + } + } + + if free_count == count.data() { + for page in free_page..free_page + free_count { + // Update usage + let mut usage = entry.usage(page)?; + usage.0 += 1; + entry.set_usage(page, usage); + + // Zero page + let page_phys = entry.base.add(page << A::PAGE_SHIFT); + let page_virt = A::phys_to_virt(page_phys); + A::write_bytes(page_virt, 0, A::PAGE_SIZE); + } + + // Update skip if necessary + if entry.skip == free_page { + entry.skip = free_page + free_count; + } + + // Update used page count + entry.used += free_count; + + // Write updated entry + 
A::write(virt, entry); + + return Some(entry.base.add(free_page << A::PAGE_SHIFT)); + } + } + + None + } + } + + unsafe fn free(&mut self, base: PhysicalAddress, count: FrameCount) { + unsafe { + if self.table_virt.data() == 0 { + return; + } + + let size = count.data() * A::PAGE_SIZE; + for i in 0..Self::BUDDY_ENTRIES { + let virt = self.table_virt.add(i * mem::size_of::>()); + let mut entry = A::read::>(virt); + + if base >= { entry.base } && base.add(size) <= entry.base.add(entry.size) { + let start_page = (base.data() - { entry.base }.data()) >> A::PAGE_SHIFT; + for page in start_page..start_page + count.data() { + let mut usage = entry.usage(page).expect("failed to get usage during free"); + + if usage.0 > 0 { + usage.0 -= 1; + } else { + panic!("tried to free already free frame"); + } + + // If page was freed + if usage.0 == 0 { + // Update skip if necessary + if page < entry.skip { + entry.skip = page; + } + + // Update used page count + entry.used -= 1; + } + + entry + .set_usage(page, usage) + .expect("failed to set usage during free"); + } + + // Write updated entry + A::write(virt, entry); + + return; + } + } + } + } + + fn usage(&self) -> FrameUsage { + unsafe { + let mut total = 0; + let mut used = 0; + for i in 0..Self::BUDDY_ENTRIES { + let virt = self.table_virt.add(i * mem::size_of::>()); + let entry = A::read::>(virt); + total += entry.size >> A::PAGE_SHIFT; + used += entry.used; + } + FrameUsage::new(FrameCount::new(used), FrameCount::new(total)) + } + } +} diff --git a/rmm/src/allocator/frame/bump.rs b/rmm/src/allocator/frame/bump.rs new file mode 100644 index 0000000000..c0c1ade5b0 --- /dev/null +++ b/rmm/src/allocator/frame/bump.rs @@ -0,0 +1,79 @@ +use core::marker::PhantomData; + +use crate::{Arch, FrameAllocator, FrameCount, FrameUsage, MemoryArea, PhysicalAddress}; + +#[derive(Debug)] +pub struct BumpAllocator { + orig_areas: (&'static [MemoryArea], usize), + cur_areas: (&'static [MemoryArea], usize), + _marker: PhantomData A>, +} + +impl 
BumpAllocator { + pub fn new(mut areas: &'static [MemoryArea], mut offset: usize) -> Self { + while let Some(first) = areas.first() + && first.size <= offset + { + offset -= first.size; + areas = &areas[1..]; + } + + Self { + orig_areas: (areas, offset), + cur_areas: (areas, offset), + _marker: PhantomData, + } + } + pub fn areas(&self) -> &'static [MemoryArea] { + self.orig_areas.0 + } + /// Returns one semifree and the fully free areas. The offset is the number of bytes after + /// which the first area is free. + pub fn free_areas(&self) -> (&'static [MemoryArea], usize) { + self.cur_areas + } + pub fn abs_offset(&self) -> PhysicalAddress { + let (areas, off) = self.cur_areas; + areas + .first() + .map_or(PhysicalAddress::new(0), |a| a.base.add(off)) + } + pub fn offset(&self) -> usize { + (self.usage().total().data() - self.usage().free().data()) * A::PAGE_SIZE + } +} + +unsafe impl FrameAllocator for BumpAllocator { + fn allocate(&mut self, count: FrameCount) -> Option { + unsafe { + let req_size = count.data() * A::PAGE_SIZE; + + let block = loop { + let area = self.cur_areas.0.first()?; + let off = self.cur_areas.1; + if area.size - off < req_size { + self.cur_areas = (&self.cur_areas.0[1..], 0); + continue; + } + self.cur_areas.1 += req_size; + + break area.base.add(off); + }; + A::write_bytes(A::phys_to_virt(block), 0, req_size); + Some(block) + } + } + + unsafe fn free(&mut self, _address: PhysicalAddress, _count: FrameCount) { + unimplemented!("BumpAllocator::free not implemented"); + } + + fn usage(&self) -> FrameUsage { + let total = self.orig_areas.0.iter().map(|a| a.size).sum::() - self.orig_areas.1; + let free = self.cur_areas.0.iter().map(|a| a.size).sum::() - self.cur_areas.1; + FrameUsage::new( + FrameCount::new((total - free) / A::PAGE_SIZE), + FrameCount::new(total / A::PAGE_SIZE), + ) + } +} diff --git a/rmm/src/allocator/frame/mod.rs b/rmm/src/allocator/frame/mod.rs new file mode 100644 index 0000000000..2261f89841 --- /dev/null +++ 
b/rmm/src/allocator/frame/mod.rs @@ -0,0 +1,83 @@ +use crate::PhysicalAddress; + +pub use self::{buddy::*, bump::*}; + +mod buddy; +mod bump; + +#[derive(Clone, Copy, Debug)] +#[repr(transparent)] +pub struct FrameCount(usize); + +impl FrameCount { + pub fn new(count: usize) -> Self { + Self(count) + } + + pub fn data(&self) -> usize { + self.0 + } +} + +#[derive(Debug)] +pub struct FrameUsage { + used: FrameCount, + total: FrameCount, +} + +impl FrameUsage { + pub fn new(used: FrameCount, total: FrameCount) -> Self { + Self { used, total } + } + + pub fn used(&self) -> FrameCount { + self.used + } + + pub fn free(&self) -> FrameCount { + FrameCount(self.total.0 - self.used.0) + } + + pub fn total(&self) -> FrameCount { + self.total + } +} + +pub unsafe trait FrameAllocator { + fn allocate(&mut self, count: FrameCount) -> Option; + + unsafe fn free(&mut self, address: PhysicalAddress, count: FrameCount); + + fn allocate_one(&mut self) -> Option { + self.allocate(FrameCount::new(1)) + } + + unsafe fn free_one(&mut self, address: PhysicalAddress) { + unsafe { + self.free(address, FrameCount::new(1)); + } + } + + fn usage(&self) -> FrameUsage; +} + +unsafe impl FrameAllocator for &mut T +where + T: FrameAllocator, +{ + fn allocate(&mut self, count: FrameCount) -> Option { + T::allocate(self, count) + } + unsafe fn free(&mut self, address: PhysicalAddress, count: FrameCount) { + unsafe { T::free(self, address, count) } + } + fn allocate_one(&mut self) -> Option { + T::allocate_one(self) + } + unsafe fn free_one(&mut self, address: PhysicalAddress) { + unsafe { T::free_one(self, address) } + } + fn usage(&self) -> FrameUsage { + T::usage(self) + } +} diff --git a/rmm/src/allocator/mod.rs b/rmm/src/allocator/mod.rs new file mode 100644 index 0000000000..0027153038 --- /dev/null +++ b/rmm/src/allocator/mod.rs @@ -0,0 +1,3 @@ +pub use self::frame::*; + +mod frame; diff --git a/rmm/src/arch/aarch64.rs b/rmm/src/arch/aarch64.rs new file mode 100644 index 
0000000000..425a6dd2b1 --- /dev/null +++ b/rmm/src/arch/aarch64.rs @@ -0,0 +1,153 @@ +use core::arch::asm; + +use crate::{Arch, PhysicalAddress, TableKind, VirtualAddress}; + +#[derive(Clone, Copy)] +pub struct AArch64Arch; + +impl Arch for AArch64Arch { + const KERNEL_SEPARATE_TABLE: bool = true; + + const PAGE_SHIFT: usize = 12; // 4096 bytes + const PAGE_ENTRY_SHIFT: usize = 9; // 512 entries, 8 bytes each + const PAGE_LEVELS: usize = 4; // L0, L1, L2, L3 + + //TODO + const ENTRY_ADDRESS_WIDTH: usize = 40; + const ENTRY_FLAG_DEFAULT_PAGE: usize = Self::ENTRY_FLAG_PRESENT + | 1 << 1 // Page flag + | 1 << 10 // Access flag + | Self::ENTRY_FLAG_NO_GLOBAL; + const ENTRY_FLAG_DEFAULT_TABLE: usize + = Self::ENTRY_FLAG_PRESENT + | Self::ENTRY_FLAG_READWRITE + | 1 << 1 // Table flag + | 1 << 10 // Access flag + ; + const ENTRY_FLAG_PRESENT: usize = 1 << 0; + const ENTRY_FLAG_READONLY: usize = 1 << 7; + const ENTRY_FLAG_READWRITE: usize = 0; + const ENTRY_FLAG_PAGE_USER: usize = 1 << 6; + // This sets both userspace and privileged execute never + //TODO: Separate the two? 
+ const ENTRY_FLAG_NO_EXEC: usize = 0b11 << 53; + const ENTRY_FLAG_EXEC: usize = 0; + const ENTRY_FLAG_GLOBAL: usize = 0; + const ENTRY_FLAG_NO_GLOBAL: usize = 1 << 11; + const ENTRY_FLAG_DEVICE_MEMORY: usize = MEM_ATTR_DEVICE_nGnRnE << 2; + const ENTRY_FLAG_UNCACHEABLE: usize = MEM_ATTR_NC << 2; + const ENTRY_FLAG_WRITE_COMBINING: usize = MEM_ATTR_NC << 2; + + const PHYS_OFFSET: usize = 0xFFFF_8000_0000_0000; + + #[inline(always)] + fn invalidate(address: VirtualAddress) { + unsafe { + asm!(" + dsb ishst + tlbi vaae1is, {} + dsb ish + isb + ", in(reg) (address.data() >> Self::PAGE_SHIFT)); + } + } + + #[inline(always)] + fn invalidate_all() { + unsafe { + asm!( + " + dsb ishst + tlbi vmalle1is + dsb ish + isb + " + ); + } + } + + #[inline(always)] + fn table(table_kind: TableKind) -> PhysicalAddress { + let address: usize; + match table_kind { + TableKind::User => { + unsafe { asm!("mrs {0}, ttbr0_el1", out(reg) address) }; + } + TableKind::Kernel => { + unsafe { asm!("mrs {0}, ttbr1_el1", out(reg) address) }; + } + } + PhysicalAddress::new(address) + } + + #[inline(always)] + unsafe fn set_table(table_kind: TableKind, address: PhysicalAddress) { + unsafe { + match table_kind { + TableKind::User => { + asm!("msr ttbr0_el1, {0}", in(reg) address.data()); + } + TableKind::Kernel => { + asm!("msr ttbr1_el1, {0}", in(reg) address.data()); + } + } + Self::invalidate_all(); + } + } + + fn virt_is_valid(_address: VirtualAddress) -> bool { + //TODO: what makes an address valid on aarch64? 
+ true + } +} + +#[cfg_attr(not(target_arch = "aarch64"), allow(unused))] +const MEM_ATTR_WB: usize = 0; +const MEM_ATTR_NC: usize = 1; +#[allow(non_upper_case_globals)] +const MEM_ATTR_DEVICE_nGnRnE: usize = 2; + +/// Set up the Memory Attribute Indirection Register (MAIR_EL1) +#[cfg(target_arch = "aarch64")] +#[inline(always)] +pub unsafe fn init_mair() { + // https://github.com/freebsd/freebsd-src/blob/d15733065c4221dcd5bb3622d225760f271f6fc9/sys/arm64/include/armreg.h#L1986-L1991 + const fn mair_attr(attr: u64, idx: usize) -> u64 { + attr << (idx * 8) + } + #[allow(non_upper_case_globals)] + const MAIR_DEVICE_nGnRnE: u64 = 0x00; + #[allow(non_upper_case_globals)] + const _MAIR_DEVICE_nGnRE: u64 = 0x04; + const MAIR_NORMAL_NC: u64 = 0x44; + const _MAIR_NORMAL_WT: u64 = 0xbb; + const MAIR_NORMAL_WB: u64 = 0xff; + + unsafe { + let val: u64 = const { + mair_attr(MAIR_DEVICE_nGnRnE, MEM_ATTR_DEVICE_nGnRnE) + | mair_attr(MAIR_NORMAL_NC, MEM_ATTR_NC) + | mair_attr(MAIR_NORMAL_WB, MEM_ATTR_WB) + }; + + asm!("msr mair_el1, {}", in(reg) val); + } +} + +const _: () = { + assert!(AArch64Arch::PAGE_SIZE == 4096); + assert!(AArch64Arch::PAGE_OFFSET_MASK == 0xFFF); + assert!(AArch64Arch::PAGE_ADDRESS_SHIFT == 48); + assert!(AArch64Arch::PAGE_ADDRESS_SIZE == 0x0001_0000_0000_0000); + assert!(AArch64Arch::PAGE_ADDRESS_MASK == 0x0000_FFFF_FFFF_F000); + assert!(AArch64Arch::PAGE_ENTRY_SIZE == 8); + assert!(AArch64Arch::PAGE_ENTRIES == 512); + assert!(AArch64Arch::PAGE_ENTRY_MASK == 0x1FF); + assert!(AArch64Arch::PAGE_NEGATIVE_MASK == 0xFFFF_0000_0000_0000); + + assert!(AArch64Arch::ENTRY_ADDRESS_SIZE == 0x0000_0100_0000_0000); + assert!(AArch64Arch::ENTRY_ADDRESS_MASK == 0x0000_00FF_FFFF_FFFF); + assert!(AArch64Arch::ENTRY_FLAGS_MASK == 0xFFF0_0000_0000_0FFF); + + assert!(AArch64Arch::PHYS_OFFSET == 0xFFFF_8000_0000_0000); +}; diff --git a/rmm/src/arch/emulate.rs b/rmm/src/arch/emulate.rs new file mode 100644 index 0000000000..4d84ca2bac --- /dev/null +++ b/rmm/src/arch/emulate.rs @@ -0,0 +1,355 @@
+extern crate std; + +use std::{boxed::Box, collections::BTreeMap, marker::PhantomData, mem, ptr, sync::Mutex, vec}; + +use crate::{ + arch::x86_64::X8664Arch, page::PageFlags, Arch, MemoryArea, PageEntry, PhysicalAddress, + TableKind, VirtualAddress, MEGABYTE, +}; + +#[derive(Clone, Copy)] +pub struct EmulateArch; + +impl EmulateArch { + pub unsafe fn init() -> &'static [MemoryArea] { + unsafe { + // Create machine with PAGE_ENTRIES pages offset mapped (2 MiB on x86_64) + let mut machine = Machine::new(MEMORY_SIZE); + + // PML4 index 256 (PHYS_OFFSET) link to PDP + let pml4 = 0; + let pdp = pml4 + Self::PAGE_SIZE; + let flags = Self::ENTRY_FLAG_READWRITE | Self::ENTRY_FLAG_PRESENT; + machine.write_phys::( + PhysicalAddress::new(pml4 + 256 * Self::PAGE_ENTRY_SIZE), + pdp | flags, + ); + + // PDP link to PD + let pd = pdp + Self::PAGE_SIZE; + machine.write_phys::(PhysicalAddress::new(pdp), pd | flags); + + // PD link to PT + let pt = pd + Self::PAGE_SIZE; + machine.write_phys::(PhysicalAddress::new(pd), pt | flags); + + // PT links to frames + for i in 0..Self::PAGE_ENTRIES { + let page = i * Self::PAGE_SIZE; + machine.write_phys::( + PhysicalAddress::new(pt + i * Self::PAGE_ENTRY_SIZE), + page | flags, + ); + } + + *MACHINE.lock().unwrap() = Some(machine); + + // Set table to pml4 + EmulateArch::set_table(TableKind::Kernel, PhysicalAddress::new(pml4)); + + &MEMORY_AREAS + } + } +} + +impl Arch for EmulateArch { + const KERNEL_SEPARATE_TABLE: bool = false; + + const PAGE_SHIFT: usize = X8664Arch::PAGE_SHIFT; + const PAGE_ENTRY_SHIFT: usize = X8664Arch::PAGE_ENTRY_SHIFT; + const PAGE_LEVELS: usize = X8664Arch::PAGE_LEVELS; + + const ENTRY_ADDRESS_SHIFT: usize = X8664Arch::ENTRY_ADDRESS_SHIFT; + const ENTRY_FLAG_DEFAULT_PAGE: usize = X8664Arch::ENTRY_FLAG_DEFAULT_PAGE; + const ENTRY_FLAG_DEFAULT_TABLE: usize = X8664Arch::ENTRY_FLAG_DEFAULT_TABLE; + const ENTRY_FLAG_PRESENT: usize = X8664Arch::ENTRY_FLAG_PRESENT; + const ENTRY_FLAG_READONLY: usize = 
X8664Arch::ENTRY_FLAG_READONLY; + const ENTRY_FLAG_READWRITE: usize = X8664Arch::ENTRY_FLAG_READWRITE; + const ENTRY_FLAG_PAGE_USER: usize = X8664Arch::ENTRY_FLAG_PAGE_USER; + const ENTRY_FLAG_NO_EXEC: usize = X8664Arch::ENTRY_FLAG_NO_EXEC; + const ENTRY_FLAG_EXEC: usize = X8664Arch::ENTRY_FLAG_EXEC; + + const PHYS_OFFSET: usize = X8664Arch::PHYS_OFFSET; + + const ENTRY_FLAG_GLOBAL: usize = X8664Arch::ENTRY_FLAG_GLOBAL; + const ENTRY_FLAG_NO_GLOBAL: usize = X8664Arch::ENTRY_FLAG_NO_GLOBAL; + + const ENTRY_ADDRESS_WIDTH: usize = X8664Arch::ENTRY_ADDRESS_WIDTH; + + const ENTRY_FLAG_DEVICE_MEMORY: usize = X8664Arch::ENTRY_FLAG_DEVICE_MEMORY; + const ENTRY_FLAG_UNCACHEABLE: usize = X8664Arch::ENTRY_FLAG_UNCACHEABLE; + const ENTRY_FLAG_WRITE_COMBINING: usize = X8664Arch::ENTRY_FLAG_WRITE_COMBINING; + + #[inline(always)] + unsafe fn read(address: VirtualAddress) -> T { + MACHINE.lock().unwrap().as_ref().unwrap().read(address) + } + + #[inline(always)] + unsafe fn write(address: VirtualAddress, value: T) { + MACHINE + .lock() + .unwrap() + .as_mut() + .unwrap() + .write(address, value) + } + + #[inline(always)] + unsafe fn write_bytes(address: VirtualAddress, value: u8, count: usize) { + MACHINE + .lock() + .unwrap() + .as_mut() + .unwrap() + .write_bytes(address, value, count) + } + + #[inline(always)] + fn invalidate(address: VirtualAddress) { + MACHINE + .lock() + .unwrap() + .as_mut() + .unwrap() + .invalidate(address); + } + + #[inline(always)] + fn invalidate_all() { + MACHINE.lock().unwrap().as_mut().unwrap().invalidate_all(); + } + + #[inline(always)] + fn table(_table_kind: TableKind) -> PhysicalAddress { + MACHINE.lock().unwrap().as_mut().unwrap().get_table() + } + + #[inline(always)] + unsafe fn set_table(_table_kind: TableKind, address: PhysicalAddress) { + MACHINE.lock().unwrap().as_mut().unwrap().set_table(address); + } + fn virt_is_valid(_address: VirtualAddress) -> bool { + // TODO: Don't see why an emulated arch would have any problems with 
canonicalness... + true + } +} + +const MEMORY_SIZE: usize = 64 * MEGABYTE; +static MEMORY_AREAS: [MemoryArea; 2] = [ + MemoryArea { + base: PhysicalAddress::new(EmulateArch::PAGE_SIZE * 4), // Initial PML4, PDP, PD, and PT wasted + size: MEMORY_SIZE / 2 - EmulateArch::PAGE_SIZE * 4, + }, + // Second area for debugging + MemoryArea { + base: PhysicalAddress::new(MEMORY_SIZE / 2), + size: MEMORY_SIZE / 2, + }, +]; + +static MACHINE: Mutex>> = Mutex::new(None); + +struct Machine { + memory: Box<[u8]>, + map: BTreeMap>, + table_addr: PhysicalAddress, + phantom: PhantomData, +} + +impl Machine { + fn new(memory_size: usize) -> Self { + Self { + memory: vec![0; memory_size].into_boxed_slice(), + map: BTreeMap::new(), + table_addr: PhysicalAddress::new(0), + phantom: PhantomData, + } + } + + fn read_phys(&self, phys: PhysicalAddress) -> T { + let size = mem::size_of::(); + if phys.add(size).data() <= self.memory.len() { + unsafe { ptr::read(self.memory.as_ptr().add(phys.data()) as *const T) } + } else { + panic!( + "read_phys: 0x{:X} size 0x{:X} outside of memory", + phys.data(), + size + ); + } + } + + fn write_phys(&mut self, phys: PhysicalAddress, value: T) { + let size = mem::size_of::(); + if phys.add(size).data() <= self.memory.len() { + unsafe { + ptr::write(self.memory.as_mut_ptr().add(phys.data()) as *mut T, value); + } + } else { + panic!( + "write_phys: 0x{:X} size 0x{:X} outside of memory", + phys.data(), + size + ); + } + } + + fn write_phys_bytes(&mut self, phys: PhysicalAddress, value: u8, count: usize) { + if phys.add(count).data() <= self.memory.len() { + unsafe { + ptr::write_bytes(self.memory.as_mut_ptr().add(phys.data()), value, count); + } + } else { + panic!( + "write_phys_bytes: 0x{:X} count 0x{:X} outside of memory", + phys.data(), + count + ); + } + } + + fn translate(&self, virt: VirtualAddress) -> Option<(PhysicalAddress, PageFlags)> { + let virt_data = virt.data(); + let page = virt_data & A::PAGE_ADDRESS_MASK; + let offset = virt_data & 
A::PAGE_OFFSET_MASK; + let entry = self.map.get(&VirtualAddress::new(page))?; + Some((entry.address().ok()?.add(offset), entry.flags())) + } + + fn read(&self, virt: VirtualAddress) -> T { + //TODO: allow reading past page boundaries + let virt_data = virt.data(); + let size = mem::size_of::(); + if (virt_data & A::PAGE_ADDRESS_MASK) != ((virt_data + (size - 1)) & A::PAGE_ADDRESS_MASK) { + panic!( + "read: 0x{:X} size 0x{:X} passes page boundary", + virt_data, size + ); + } + + if let Some((phys, _flags)) = self.translate(virt) { + self.read_phys(phys) + } else { + panic!("read: 0x{:X} size 0x{:X} not present", virt_data, size); + } + } + + fn write(&mut self, virt: VirtualAddress, value: T) { + //TODO: allow writing past page boundaries + let virt_data = virt.data(); + let size = mem::size_of::(); + if (virt_data & A::PAGE_ADDRESS_MASK) != ((virt_data + (size - 1)) & A::PAGE_ADDRESS_MASK) { + panic!( + "write: 0x{:X} size 0x{:X} passes page boundary", + virt_data, size + ); + } + + if let Some((phys, flags)) = self.translate(virt) { + if flags.has_write() { + self.write_phys(phys, value); + } else { + panic!("write: 0x{:X} size 0x{:X} not writable", virt_data, size); + } + } else { + panic!("write: 0x{:X} size 0x{:X} not present", virt_data, size); + } + } + + fn write_bytes(&mut self, virt: VirtualAddress, value: u8, count: usize) { + //TODO: allow writing past page boundaries + let virt_data = virt.data(); + if (virt_data & A::PAGE_ADDRESS_MASK) != ((virt_data + (count - 1)) & A::PAGE_ADDRESS_MASK) + { + panic!( + "write_bytes: 0x{:X} count 0x{:X} passes page boundary", + virt_data, count + ); + } + + if let Some((phys, flags)) = self.translate(virt) { + if flags.has_write() { + self.write_phys_bytes(phys, value, count); + } else { + panic!( + "write_bytes: 0x{:X} count 0x{:X} not writable", + virt_data, count + ); + } + } else { + panic!( + "write_bytes: 0x{:X} count 0x{:X} not present", + virt_data, count + ); + } + } + + fn invalidate(&mut self, _address: 
VirtualAddress) { + unimplemented!("EmulateArch::invalidate not implemented"); + } + + //TODO: cleanup + fn invalidate_all(&mut self) { + self.map.clear(); + + // PML4 + let a4 = self.table_addr.data(); + for i4 in 0..A::PAGE_ENTRIES { + let e3 = self.read_phys::(PhysicalAddress::new(a4 + i4 * A::PAGE_ENTRY_SIZE)); + let f3 = e3 & A::ENTRY_FLAGS_MASK; + if f3 & A::ENTRY_FLAG_PRESENT == 0 { + continue; + } + + // Page directory pointer + let a3 = ((e3 >> A::ENTRY_ADDRESS_SHIFT) & A::ENTRY_ADDRESS_MASK) << A::PAGE_SHIFT; + for i3 in 0..A::PAGE_ENTRIES { + let e2 = + self.read_phys::(PhysicalAddress::new(a3 + i3 * A::PAGE_ENTRY_SIZE)); + let f2 = e2 & A::ENTRY_FLAGS_MASK; + if f2 & A::ENTRY_FLAG_PRESENT == 0 { + continue; + } + + // Page directory + let a2 = ((e2 >> A::ENTRY_ADDRESS_SHIFT) & A::ENTRY_ADDRESS_MASK) << A::PAGE_SHIFT; + for i2 in 0..A::PAGE_ENTRIES { + let e1 = + self.read_phys::(PhysicalAddress::new(a2 + i2 * A::PAGE_ENTRY_SIZE)); + let f1 = e1 & A::ENTRY_FLAGS_MASK; + if f1 & A::ENTRY_FLAG_PRESENT == 0 { + continue; + } + + // Page table + let a1 = + ((e1 >> A::ENTRY_ADDRESS_SHIFT) & A::ENTRY_ADDRESS_MASK) << A::PAGE_SHIFT; + for i1 in 0..A::PAGE_ENTRIES { + let e = self + .read_phys::(PhysicalAddress::new(a1 + i1 * A::PAGE_ENTRY_SIZE)); + let f = e & A::ENTRY_FLAGS_MASK; + if f & A::ENTRY_FLAG_PRESENT == 0 { + continue; + } + + // Page + let page = (i4 << 39) | (i3 << 30) | (i2 << 21) | (i1 << 12); + //println!("map 0x{:X} to 0x{:X}, 0x{:X}", page, a, f); + self.map + .insert(VirtualAddress::new(page), PageEntry::from_data(e)); + } + } + } + } + } + + fn get_table(&self) -> PhysicalAddress { + self.table_addr + } + + fn set_table(&mut self, address: PhysicalAddress) { + self.table_addr = address; + self.invalidate_all(); + } +} diff --git a/rmm/src/arch/mod.rs b/rmm/src/arch/mod.rs new file mode 100644 index 0000000000..e08377714f --- /dev/null +++ b/rmm/src/arch/mod.rs @@ -0,0 +1,93 @@ +use core::ptr; + +use crate::{PhysicalAddress, TableKind, 
VirtualAddress}; + +//TODO: Support having all page tables compile on all architectures +#[cfg(target_pointer_width = "64")] +pub mod aarch64; +#[cfg(all(feature = "std", target_pointer_width = "64"))] +pub mod emulate; +#[cfg(target_pointer_width = "64")] +pub mod riscv64; +#[cfg(target_pointer_width = "32")] +pub mod x86; +#[cfg(target_pointer_width = "64")] +pub mod x86_64; +mod x86_shared; + +pub trait Arch: Clone + Copy { + /// Does the architecture use a separate page table for the kernel. + /// + /// If false, the page table entries corresponding to the top half of the + /// address space will be copied into the top level of every page table + /// and will never be unmapped when unmapping pages. + const KERNEL_SEPARATE_TABLE: bool; + + const PAGE_SHIFT: usize; + const PAGE_ENTRY_SHIFT: usize; + const PAGE_LEVELS: usize; + + const ENTRY_ADDRESS_WIDTH: usize; // Number of bits of physical address in PTE + const ENTRY_ADDRESS_SHIFT: usize = Self::PAGE_SHIFT; // Offset of physical address in PTE + const ENTRY_FLAG_DEFAULT_PAGE: usize; + const ENTRY_FLAG_DEFAULT_TABLE: usize; + const ENTRY_FLAG_PRESENT: usize; + const ENTRY_FLAG_READONLY: usize; + const ENTRY_FLAG_READWRITE: usize; + const ENTRY_FLAG_PAGE_USER: usize; // Leaf table user page flag + const ENTRY_FLAG_TABLE_USER: usize = Self::ENTRY_FLAG_PAGE_USER; // Directory user page table flag + const ENTRY_FLAG_NO_EXEC: usize; + const ENTRY_FLAG_EXEC: usize; + const ENTRY_FLAG_GLOBAL: usize; + const ENTRY_FLAG_NO_GLOBAL: usize; + const ENTRY_FLAG_DEVICE_MEMORY: usize; + const ENTRY_FLAG_UNCACHEABLE: usize; + const ENTRY_FLAG_WRITE_COMBINING: usize; + + const PHYS_OFFSET: usize; + + const PAGE_SIZE: usize = 1 << Self::PAGE_SHIFT; + const PAGE_OFFSET_MASK: usize = Self::PAGE_SIZE - 1; + const PAGE_ADDRESS_SHIFT: usize = Self::PAGE_LEVELS * Self::PAGE_ENTRY_SHIFT + Self::PAGE_SHIFT; + const PAGE_ADDRESS_SIZE: u64 = 1 << (Self::PAGE_ADDRESS_SHIFT as u64); + const PAGE_ADDRESS_MASK: usize = (Self::PAGE_ADDRESS_SIZE 
- (Self::PAGE_SIZE as u64)) as usize; + const PAGE_ENTRY_SIZE: usize = 1 << (Self::PAGE_SHIFT - Self::PAGE_ENTRY_SHIFT); + const PAGE_ENTRIES: usize = 1 << Self::PAGE_ENTRY_SHIFT; + const PAGE_ENTRY_MASK: usize = Self::PAGE_ENTRIES - 1; + const PAGE_NEGATIVE_MASK: usize = !(Self::PAGE_ADDRESS_SIZE - 1) as usize; + + const ENTRY_ADDRESS_SIZE: usize = 1 << Self::ENTRY_ADDRESS_WIDTH; // size of addressable physical memory, in pages + const ENTRY_ADDRESS_MASK: usize = Self::ENTRY_ADDRESS_SIZE - 1; // Mask of physical address, starting at 0th bit + const ENTRY_FLAGS_MASK: usize = !(Self::ENTRY_ADDRESS_MASK << Self::ENTRY_ADDRESS_SHIFT); + + #[inline(always)] + unsafe fn read(address: VirtualAddress) -> T { + unsafe { ptr::read(address.data() as *const T) } + } + + #[inline(always)] + unsafe fn write(address: VirtualAddress, value: T) { + unsafe { ptr::write(address.data() as *mut T, value) } + } + + #[inline(always)] + unsafe fn write_bytes(address: VirtualAddress, value: u8, count: usize) { + unsafe { ptr::write_bytes(address.data() as *mut u8, value, count) } + } + + fn invalidate(address: VirtualAddress); + fn invalidate_all(); + + fn table(table_kind: TableKind) -> PhysicalAddress; + unsafe fn set_table(table_kind: TableKind, address: PhysicalAddress); + + #[inline(always)] + fn phys_to_virt(phys: PhysicalAddress) -> VirtualAddress { + match phys.data().checked_add(Self::PHYS_OFFSET) { + Some(some) => VirtualAddress::new(some), + None => panic!("phys_to_virt({:#x}) overflow", phys.data()), + } + } + + fn virt_is_valid(address: VirtualAddress) -> bool; +} diff --git a/rmm/src/arch/riscv64/mod.rs b/rmm/src/arch/riscv64/mod.rs new file mode 100644 index 0000000000..b12daec511 --- /dev/null +++ b/rmm/src/arch/riscv64/mod.rs @@ -0,0 +1,7 @@ +pub use sv39::RiscV64Sv39Arch; +pub use sv48::RiscV64Sv48Arch; +pub use sv57::RiscV64Sv57Arch; + +mod sv39; +mod sv48; +mod sv57; diff --git a/rmm/src/arch/riscv64/sv39.rs b/rmm/src/arch/riscv64/sv39.rs new file mode 100644 index 
0000000000..8bb5083944 --- /dev/null +++ b/rmm/src/arch/riscv64/sv39.rs @@ -0,0 +1,124 @@ +use core::arch::asm; + +use crate::{Arch, PhysicalAddress, TableKind, VirtualAddress}; + +#[derive(Clone, Copy)] +pub struct RiscV64Sv39Arch; + +pub const ACCESSED: usize = 1 << 6; +pub const DIRTY: usize = 1 << 7; + +impl Arch for RiscV64Sv39Arch { + const KERNEL_SEPARATE_TABLE: bool = false; + + const PAGE_SHIFT: usize = 12; // 4096 bytes + const PAGE_ENTRY_SHIFT: usize = 9; // 512 entries, 8 bytes each + const PAGE_LEVELS: usize = 3; // L0, L1, L2 + + const ENTRY_ADDRESS_WIDTH: usize = 44; + const ENTRY_ADDRESS_SHIFT: usize = 10; + + const ENTRY_FLAG_DEFAULT_PAGE: usize = + Self::ENTRY_FLAG_PRESENT | Self::ENTRY_FLAG_READONLY | ACCESSED | DIRTY; + const ENTRY_FLAG_DEFAULT_TABLE: usize = Self::ENTRY_FLAG_PRESENT; + const ENTRY_FLAG_PRESENT: usize = 1 << 0; + const ENTRY_FLAG_READONLY: usize = 1 << 1; + const ENTRY_FLAG_READWRITE: usize = 3 << 1; + const ENTRY_FLAG_PAGE_USER: usize = 1 << 4; + const ENTRY_FLAG_TABLE_USER: usize = 0; + const ENTRY_FLAG_NO_EXEC: usize = 0; + const ENTRY_FLAG_EXEC: usize = 1 << 3; + const ENTRY_FLAG_GLOBAL: usize = 1 << 5; + const ENTRY_FLAG_NO_GLOBAL: usize = 0; + const ENTRY_FLAG_DEVICE_MEMORY: usize = 0; // FIXME use Svpbmt + const ENTRY_FLAG_UNCACHEABLE: usize = 0; // FIXME use Svpbmt + const ENTRY_FLAG_WRITE_COMBINING: usize = 0; // FIXME use Svpbmt + + const PHYS_OFFSET: usize = 0xFFFF_FFC0_0000_0000; + + #[inline(always)] + fn invalidate(address: VirtualAddress) { + unsafe { asm!("sfence.vma {}", in(reg) address.data()) }; + } + + #[inline(always)] + fn invalidate_all() { + unsafe { asm!("sfence.vma") }; + } + + #[inline(always)] + fn table(_table_kind: TableKind) -> PhysicalAddress { + let satp: usize; + unsafe { asm!("csrr {0}, satp", out(reg) satp) }; + PhysicalAddress::new( + (satp & Self::ENTRY_ADDRESS_MASK) << Self::PAGE_SHIFT, // Convert from PPN + ) + } + + #[inline(always)] + unsafe fn set_table(_table_kind: TableKind, address: 
PhysicalAddress) { + let satp = (8 << 60) | // Sv39 MODE + (address.data() >> Self::PAGE_SHIFT); // Convert to PPN (TODO: ensure alignment) + unsafe { + asm!("csrw satp, {0}", in(reg) satp); + Self::invalidate_all(); + } + } + + fn virt_is_valid(address: VirtualAddress) -> bool { + let mask = !((Self::PAGE_ADDRESS_SIZE as usize - 1) >> 1); + let masked = address.data() & mask; + + masked == mask || masked == 0 + } +} + +const _: () = { + assert!(RiscV64Sv39Arch::PAGE_SIZE == 4096); + assert!(RiscV64Sv39Arch::PAGE_OFFSET_MASK == 0xFFF); + assert!(RiscV64Sv39Arch::PAGE_ADDRESS_SHIFT == 39); + assert!(RiscV64Sv39Arch::PAGE_ADDRESS_SIZE == 0x0000_0080_0000_0000); + assert!(RiscV64Sv39Arch::PAGE_ADDRESS_MASK == 0x0000_007F_FFFF_F000); + assert!(RiscV64Sv39Arch::PAGE_ENTRY_SIZE == 8); + assert!(RiscV64Sv39Arch::PAGE_ENTRIES == 512); + assert!(RiscV64Sv39Arch::PAGE_ENTRY_MASK == 0x1FF); + assert!(RiscV64Sv39Arch::PAGE_NEGATIVE_MASK == 0xFFFF_FF80_0000_0000); + + assert!(RiscV64Sv39Arch::ENTRY_ADDRESS_SIZE == 0x0000_1000_0000_0000); + assert!(RiscV64Sv39Arch::ENTRY_ADDRESS_MASK == 0x0000_0FFF_FFFF_FFFF); + assert!(RiscV64Sv39Arch::ENTRY_FLAGS_MASK == 0xFFC0_0000_0000_03FF); + + assert!(RiscV64Sv39Arch::PHYS_OFFSET == 0xFFFF_FFC0_0000_0000); +}; + +#[cfg(test)] +mod tests { + use super::RiscV64Sv39Arch; + use crate::Arch; + + #[test] + fn is_canonical() { + use super::VirtualAddress; + + #[track_caller] + fn yes(addr: usize) { + assert!(RiscV64Sv39Arch::virt_is_valid(VirtualAddress::new(addr))); + } + #[track_caller] + fn no(addr: usize) { + assert!(!RiscV64Sv39Arch::virt_is_valid(VirtualAddress::new(addr))); + } + + yes(0xFFFF_FFFF_FFFF_FFFF); + yes(0xFFFF_FFF0_1337_1337); + no(0x0000_0F00_0000_0000); + no(0x1337_0000_0000_0000); + no(1 << 38); + yes(1 << 37); + + // Check for off-by-one errors. 
+ yes(0xFFFF_FFC0_0000_0000 | (1 << 37)); + yes(0xFFFF_FFE0_0000_0000 | (1 << 37)); + no(0xFFFF_FF80_0000_0000 | (1 << 37)); + } +} diff --git a/rmm/src/arch/riscv64/sv48.rs b/rmm/src/arch/riscv64/sv48.rs new file mode 100644 index 0000000000..fd421d0e44 --- /dev/null +++ b/rmm/src/arch/riscv64/sv48.rs @@ -0,0 +1,118 @@ +use core::arch::asm; + +use crate::{Arch, PhysicalAddress, TableKind, VirtualAddress}; + +#[derive(Clone, Copy)] +pub struct RiscV64Sv48Arch; + +impl Arch for RiscV64Sv48Arch { + const KERNEL_SEPARATE_TABLE: bool = false; + + const PAGE_SHIFT: usize = 12; // 4096 bytes + const PAGE_ENTRY_SHIFT: usize = 9; // 512 entries, 8 bytes each + const PAGE_LEVELS: usize = 4; // L0, L1, L2, L3 + + const ENTRY_ADDRESS_WIDTH: usize = 44; + const ENTRY_ADDRESS_SHIFT: usize = 10; + + const ENTRY_FLAG_DEFAULT_PAGE: usize = Self::ENTRY_FLAG_PRESENT | Self::ENTRY_FLAG_READONLY; + const ENTRY_FLAG_DEFAULT_TABLE: usize = Self::ENTRY_FLAG_PRESENT; + const ENTRY_FLAG_PRESENT: usize = 1 << 0; + const ENTRY_FLAG_READONLY: usize = 1 << 1; + const ENTRY_FLAG_READWRITE: usize = 3 << 1; + const ENTRY_FLAG_PAGE_USER: usize = 1 << 4; + const ENTRY_FLAG_TABLE_USER: usize = 0; + const ENTRY_FLAG_NO_EXEC: usize = 0; + const ENTRY_FLAG_EXEC: usize = 1 << 3; + const ENTRY_FLAG_GLOBAL: usize = 1 << 5; + const ENTRY_FLAG_NO_GLOBAL: usize = 0; + const ENTRY_FLAG_DEVICE_MEMORY: usize = 0; // FIXME use Svpbmt + const ENTRY_FLAG_UNCACHEABLE: usize = 0; // FIXME use Svpbmt + const ENTRY_FLAG_WRITE_COMBINING: usize = 0; // FIXME use Svpbmt + + const PHYS_OFFSET: usize = 0xFFFF_8000_0000_0000; + + #[inline(always)] + fn invalidate(address: VirtualAddress) { + unsafe { asm!("sfence.vma {}", in(reg) address.data()) }; + } + + #[inline(always)] + fn invalidate_all() { + unsafe { asm!("sfence.vma") }; + } + + #[inline(always)] + fn table(_table_kind: TableKind) -> PhysicalAddress { + let satp: usize; + unsafe { asm!("csrr {0}, satp", out(reg) satp) }; + PhysicalAddress::new( + (satp & 
Self::ENTRY_ADDRESS_MASK) << Self::PAGE_SHIFT, // Convert from PPN + ) + } + + #[inline(always)] + unsafe fn set_table(_table_kind: TableKind, address: PhysicalAddress) { + let satp = (9 << 60) | // Sv48 MODE + (address.data() >> Self::PAGE_SHIFT); // Convert to PPN (TODO: ensure alignment) + unsafe { + asm!("csrw satp, {0}", in(reg) satp); + Self::invalidate_all(); + } + } + + fn virt_is_valid(address: VirtualAddress) -> bool { + // RISC-V SV48 uses 48-bit sign-extended addresses, identical to 4-level paging on x86_64. + let mask = !((Self::PAGE_ADDRESS_SIZE as usize - 1) >> 1); + let masked = address.data() & mask; + + masked == mask || masked == 0 + } +} + +const _: () = { + assert!(RiscV64Sv48Arch::PAGE_SIZE == 4096); + assert!(RiscV64Sv48Arch::PAGE_OFFSET_MASK == 0xFFF); + assert!(RiscV64Sv48Arch::PAGE_ADDRESS_SHIFT == 48); + assert!(RiscV64Sv48Arch::PAGE_ADDRESS_SIZE == 0x0001_0000_0000_0000); + assert!(RiscV64Sv48Arch::PAGE_ADDRESS_MASK == 0x0000_FFFF_FFFF_F000); + assert!(RiscV64Sv48Arch::PAGE_ENTRY_SIZE == 8); + assert!(RiscV64Sv48Arch::PAGE_ENTRIES == 512); + assert!(RiscV64Sv48Arch::PAGE_ENTRY_MASK == 0x1FF); + assert!(RiscV64Sv48Arch::PAGE_NEGATIVE_MASK == 0xFFFF_0000_0000_0000); + + assert!(RiscV64Sv48Arch::ENTRY_ADDRESS_SIZE == 0x0000_1000_0000_0000); + assert!(RiscV64Sv48Arch::ENTRY_ADDRESS_MASK == 0x0000_0FFF_FFFF_FFFF); + assert!(RiscV64Sv48Arch::ENTRY_FLAGS_MASK == 0xFFC0_0000_0000_03FF); + + assert!(RiscV64Sv48Arch::PHYS_OFFSET == 0xFFFF_8000_0000_0000); +}; + +#[cfg(test)] +mod tests { + use super::RiscV64Sv48Arch; + use crate::Arch; + + #[test] + fn is_canonical() { + use super::VirtualAddress; + + // Close to identical when compared to x86_64 test. 
+ fn yes(address: usize) { + assert!(RiscV64Sv48Arch::virt_is_valid(VirtualAddress::new(address))); + } + fn no(address: usize) { + assert!(!RiscV64Sv48Arch::virt_is_valid(VirtualAddress::new( + address + ))); + } + + yes(0xFFFF_8000_1337_1337); + yes(0xFFFF_FFFF_FFFF_FFFF); + yes(0x0000_0000_0000_0042); + yes(0x0000_7FFF_FFFF_FFFF); + no(0x1337_0000_0000_0000); + no(0x1337_8000_0000_0000); + no(0x0000_8000_0000_0000); + } +} diff --git a/rmm/src/arch/riscv64/sv57.rs b/rmm/src/arch/riscv64/sv57.rs new file mode 100644 index 0000000000..c54ffa4a05 --- /dev/null +++ b/rmm/src/arch/riscv64/sv57.rs @@ -0,0 +1,116 @@ +use core::arch::asm; + +use crate::{Arch, PhysicalAddress, TableKind, VirtualAddress}; + +#[derive(Clone, Copy)] +pub struct RiscV64Sv57Arch; + +impl Arch for RiscV64Sv57Arch { + const KERNEL_SEPARATE_TABLE: bool = false; + + const PAGE_SHIFT: usize = 12; // 4096 bytes + const PAGE_ENTRY_SHIFT: usize = 9; // 512 entries, 8 bytes each + const PAGE_LEVELS: usize = 5; // L0, L1, L2, L3, L4 + + const ENTRY_ADDRESS_WIDTH: usize = 44; + const ENTRY_ADDRESS_SHIFT: usize = 10; + + const ENTRY_FLAG_DEFAULT_PAGE: usize = Self::ENTRY_FLAG_PRESENT | Self::ENTRY_FLAG_READONLY; + const ENTRY_FLAG_DEFAULT_TABLE: usize = Self::ENTRY_FLAG_PRESENT; + const ENTRY_FLAG_PRESENT: usize = 1 << 0; + const ENTRY_FLAG_READONLY: usize = 1 << 1; + const ENTRY_FLAG_READWRITE: usize = 3 << 1; + const ENTRY_FLAG_PAGE_USER: usize = 1 << 4; + const ENTRY_FLAG_TABLE_USER: usize = 0; + const ENTRY_FLAG_NO_EXEC: usize = 0; + const ENTRY_FLAG_EXEC: usize = 1 << 3; + const ENTRY_FLAG_GLOBAL: usize = 1 << 5; + const ENTRY_FLAG_NO_GLOBAL: usize = 0; + const ENTRY_FLAG_DEVICE_MEMORY: usize = 0; // FIXME use Svpbmt + const ENTRY_FLAG_UNCACHEABLE: usize = 0; // FIXME use Svpbmt + const ENTRY_FLAG_WRITE_COMBINING: usize = 0; // FIXME use Svpbmt + + const PHYS_OFFSET: usize = 0xFF00_0000_0000_0000; + + #[inline(always)] + fn invalidate(address: VirtualAddress) { + unsafe { asm!("sfence.vma {}", 
in(reg) address.data()) }; + } + + #[inline(always)] + fn invalidate_all() { + unsafe { asm!("sfence.vma") }; + } + + #[inline(always)] + fn table(_table_kind: TableKind) -> PhysicalAddress { + let satp: usize; + unsafe { asm!("csrr {0}, satp", out(reg) satp) }; + PhysicalAddress::new( + (satp & Self::ENTRY_ADDRESS_MASK) << Self::PAGE_SHIFT, // Convert from PPN + ) + } + + #[inline(always)] + unsafe fn set_table(_table_kind: TableKind, address: PhysicalAddress) { + let satp = (10 << 60) | // Sv57 MODE + (address.data() >> Self::PAGE_SHIFT); // Convert to PPN (TODO: ensure alignment) + unsafe { + asm!("csrw satp, {0}", in(reg) satp); + Self::invalidate_all(); + } + } + + fn virt_is_valid(address: VirtualAddress) -> bool { + let mask = !((Self::PAGE_ADDRESS_SIZE as usize - 1) >> 1); + let masked = address.data() & mask; + + masked == mask || masked == 0 + } +} + +const _: () = { + assert!(RiscV64Sv57Arch::PAGE_SIZE == 4096); + assert!(RiscV64Sv57Arch::PAGE_OFFSET_MASK == 0xFFF); + assert!(RiscV64Sv57Arch::PAGE_ADDRESS_SHIFT == 57); + assert!(RiscV64Sv57Arch::PAGE_ADDRESS_SIZE == 0x0200_0000_0000_0000); + assert!(RiscV64Sv57Arch::PAGE_ADDRESS_MASK == 0x01FF_FFFF_FFFF_F000); + assert!(RiscV64Sv57Arch::PAGE_ENTRY_SIZE == 8); + assert!(RiscV64Sv57Arch::PAGE_ENTRIES == 512); + assert!(RiscV64Sv57Arch::PAGE_ENTRY_MASK == 0x1FF); + assert!(RiscV64Sv57Arch::PAGE_NEGATIVE_MASK == 0xFE00_0000_0000_0000); + + assert!(RiscV64Sv57Arch::ENTRY_ADDRESS_SIZE == 0x0000_1000_0000_0000); + assert!(RiscV64Sv57Arch::ENTRY_ADDRESS_MASK == 0x0000_0FFF_FFFF_FFFF); + assert!(RiscV64Sv57Arch::ENTRY_FLAGS_MASK == 0xFFC0_0000_0000_03FF); + + assert!(RiscV64Sv57Arch::PHYS_OFFSET == 0xFF00_0000_0000_0000); +}; + +#[cfg(test)] +mod tests { + use super::RiscV64Sv57Arch; + use crate::Arch; + + #[test] + fn is_canonical() { + use super::VirtualAddress; + + fn yes(address: usize) { + assert!(RiscV64Sv57Arch::virt_is_valid(VirtualAddress::new(address))); + } + fn no(address: usize) { + 
assert!(!RiscV64Sv57Arch::virt_is_valid(VirtualAddress::new( + address + ))); + } + + yes(0xFF00_0000_1337_1337); + yes(0xFFFF_FFFF_FFFF_FFFF); + yes(0x0000_0000_0000_0042); + yes(0x00FF_FFFF_FFFF_FFFF); + no(0x1337_0000_0000_0000); + no(0x1337_8000_0000_0000); + no(0x0F00_0000_0000_0000); + } +} diff --git a/rmm/src/arch/x86.rs b/rmm/src/arch/x86.rs new file mode 100644 index 0000000000..ade8dd2ccc --- /dev/null +++ b/rmm/src/arch/x86.rs @@ -0,0 +1,80 @@ +//TODO: USE PAE +use core::arch::asm; + +use crate::{Arch, PhysicalAddress, TableKind, VirtualAddress}; + +#[derive(Clone, Copy)] +pub struct X86Arch; + +impl Arch for X86Arch { + const KERNEL_SEPARATE_TABLE: bool = false; + + const PAGE_SHIFT: usize = 12; // 4096 bytes + const PAGE_ENTRY_SHIFT: usize = 10; // 1024 entries, 4 bytes each + const PAGE_LEVELS: usize = 2; // PD, PT + + const ENTRY_ADDRESS_WIDTH: usize = 20; + const ENTRY_FLAG_DEFAULT_PAGE: usize = Self::ENTRY_FLAG_PRESENT; + const ENTRY_FLAG_DEFAULT_TABLE: usize = Self::ENTRY_FLAG_PRESENT | Self::ENTRY_FLAG_READWRITE; + const ENTRY_FLAG_PRESENT: usize = 1 << 0; + const ENTRY_FLAG_READONLY: usize = 0; + const ENTRY_FLAG_READWRITE: usize = 1 << 1; + const ENTRY_FLAG_PAGE_USER: usize = 1 << 2; + // Not used: const ENTRY_FLAG_HUGE: usize = 1 << 7; + const ENTRY_FLAG_GLOBAL: usize = 1 << 8; + const ENTRY_FLAG_NO_GLOBAL: usize = 0; + const ENTRY_FLAG_NO_EXEC: usize = 0; // NOT AVAILABLE UNLESS PAE IS USED! 
+ const ENTRY_FLAG_EXEC: usize = 0; + const ENTRY_FLAG_DEVICE_MEMORY: usize = PAT_UC_; + const ENTRY_FLAG_UNCACHEABLE: usize = PAT_UC_; + const ENTRY_FLAG_WRITE_COMBINING: usize = PAT_WC; + + const PHYS_OFFSET: usize = 0x8000_0000; + + #[inline(always)] + fn invalidate(address: VirtualAddress) { + unsafe { asm!("invlpg [{0}]", in(reg) address.data()) }; + } + + #[inline(always)] + fn invalidate_all() { + unsafe { Self::set_table(TableKind::User, Self::table(TableKind::User)) }; + } + + #[inline(always)] + fn table(_table_kind: TableKind) -> PhysicalAddress { + let address: usize; + unsafe { asm!("mov {0}, cr3", out(reg) address) }; + PhysicalAddress::new(address) + } + + #[inline(always)] + unsafe fn set_table(_table_kind: TableKind, address: PhysicalAddress) { + unsafe { asm!("mov cr3, {0}", in(reg) address.data()) }; + } + + fn virt_is_valid(_address: VirtualAddress) -> bool { + // On 32-bit x86, every virtual address is valid + true + } +} + +pub use super::x86_shared::*; + +const _: () = { + assert!(X86Arch::PAGE_SIZE == 4096); + assert!(X86Arch::PAGE_OFFSET_MASK == 0xFFF); + assert!(X86Arch::PAGE_ADDRESS_SHIFT == 32); + assert!(X86Arch::PAGE_ADDRESS_SIZE == 0x0000_0001_0000_0000); + assert!(X86Arch::PAGE_ADDRESS_MASK == 0xFFFF_F000); + assert!(X86Arch::PAGE_ENTRY_SIZE == 4); + assert!(X86Arch::PAGE_ENTRIES == 1024); + assert!(X86Arch::PAGE_ENTRY_MASK == 0x3FF); + assert!(X86Arch::PAGE_NEGATIVE_MASK == 0x0000_0000_0000); + + assert!(X86Arch::ENTRY_ADDRESS_SIZE == 0x0000_0000_0010_0000); + assert!(X86Arch::ENTRY_ADDRESS_MASK == 0x000F_FFFF); + assert!(X86Arch::ENTRY_FLAGS_MASK == 0x0000_0FFF); + + assert!(X86Arch::PHYS_OFFSET == 0x8000_0000); +}; diff --git a/rmm/src/arch/x86_64.rs b/rmm/src/arch/x86_64.rs new file mode 100644 index 0000000000..7c52163a73 --- /dev/null +++ b/rmm/src/arch/x86_64.rs @@ -0,0 +1,107 @@ +use core::arch::asm; + +use crate::{Arch, PhysicalAddress, TableKind, VirtualAddress}; + +#[derive(Clone, Copy, Debug)] +pub struct X8664Arch; + 
+impl Arch for X8664Arch { // x86_64 paging: 4 levels of 512-entry tables over 4096-byte pages
+    const KERNEL_SEPARATE_TABLE: bool = false;
+
+    const PAGE_SHIFT: usize = 12; // 4096 bytes
+    const PAGE_ENTRY_SHIFT: usize = 9; // 512 entries, 8 bytes each
+    const PAGE_LEVELS: usize = 4; // PML4, PDP, PD, PT
+
+    const ENTRY_ADDRESS_WIDTH: usize = 40; // gives ENTRY_ADDRESS_MASK == 0x00FF_FFFF_FFFF (asserted below)
+    const ENTRY_FLAG_DEFAULT_PAGE: usize = Self::ENTRY_FLAG_PRESENT;
+    const ENTRY_FLAG_DEFAULT_TABLE: usize = Self::ENTRY_FLAG_PRESENT | Self::ENTRY_FLAG_READWRITE;
+    const ENTRY_FLAG_PRESENT: usize = 1 << 0;
+    const ENTRY_FLAG_READONLY: usize = 0; // read-only is expressed as the absence of READWRITE
+    const ENTRY_FLAG_READWRITE: usize = 1 << 1;
+    const ENTRY_FLAG_PAGE_USER: usize = 1 << 2;
+    // Not used: const ENTRY_FLAG_HUGE: usize = 1 << 7;
+    const ENTRY_FLAG_GLOBAL: usize = 1 << 8;
+    const ENTRY_FLAG_NO_GLOBAL: usize = 0;
+    const ENTRY_FLAG_NO_EXEC: usize = 1 << 63;
+    const ENTRY_FLAG_EXEC: usize = 0; // executable is expressed as the absence of NO_EXEC
+    const ENTRY_FLAG_DEVICE_MEMORY: usize = PAT_UC_; // same encoding as ENTRY_FLAG_UNCACHEABLE
+    const ENTRY_FLAG_UNCACHEABLE: usize = PAT_UC_;
+    const ENTRY_FLAG_WRITE_COMBINING: usize = PAT_WC;
+
+    const PHYS_OFFSET: usize = Self::PAGE_NEGATIVE_MASK + (Self::PAGE_ADDRESS_SIZE >> 1) as usize; // PML4 slot 256 and onwards
+
+    #[inline(always)]
+    fn invalidate(address: VirtualAddress) {
+        unsafe { asm!("invlpg [{0}]", in(reg) address.data()) };
+    }
+
+    #[inline(always)]
+    fn invalidate_all() {
+        unsafe { Self::set_table(TableKind::User, Self::table(TableKind::User)) }; // rewrites CR3 with the value just read from it
+    }
+
+    #[inline(always)]
+    fn table(_table_kind: TableKind) -> PhysicalAddress { // the table kind is ignored: CR3 is read either way
+        let address: usize;
+        unsafe { asm!("mov {0}, cr3", out(reg) address) };
+        PhysicalAddress::new(address)
+    }
+
+    #[inline(always)]
+    unsafe fn set_table(_table_kind: TableKind, address: PhysicalAddress) { // the table kind is ignored: CR3 is written either way
+        unsafe { asm!("mov cr3, {0}", in(reg) address.data()) };
+    }
+
+    fn virt_is_valid(address: VirtualAddress) -> bool {
+        // On x86_64, an address is valid if and only if it is canonical. It may still point to
+        // unmapped memory, but it will always be valid once translation via the page table has
+        // succeeded.
+        let masked = address.data() & 0xFFFF_8000_0000_0000; // keep only bits 47..=63
+        // TODO: 5-level paging
+        masked == 0xFFFF_8000_0000_0000 || masked == 0 // those bits must be all ones or all zeros
+    }
+}
+
+pub use super::x86_shared::*;
+
+const _: () = { // evaluated at compile time: a failing assert breaks the build
+    assert!(X8664Arch::PAGE_SIZE == 4096);
+    assert!(X8664Arch::PAGE_OFFSET_MASK == 0xFFF);
+    assert!(X8664Arch::PAGE_ADDRESS_SHIFT == 48);
+    assert!(X8664Arch::PAGE_ADDRESS_SIZE == 0x0001_0000_0000_0000);
+    assert!(X8664Arch::PAGE_ADDRESS_MASK == 0x0000_FFFF_FFFF_F000);
+    assert!(X8664Arch::PAGE_ENTRY_SIZE == 8);
+    assert!(X8664Arch::PAGE_ENTRIES == 512);
+    assert!(X8664Arch::PAGE_ENTRY_MASK == 0x1FF);
+    assert!(X8664Arch::PAGE_NEGATIVE_MASK == 0xFFFF_0000_0000_0000);
+
+    assert!(X8664Arch::ENTRY_ADDRESS_SIZE == 0x0000_0100_0000_0000);
+    assert!(X8664Arch::ENTRY_ADDRESS_MASK == 0x0000_00FF_FFFF_FFFF);
+    assert!(X8664Arch::ENTRY_FLAGS_MASK == 0xFFF0_0000_0000_0FFF);
+
+    assert!(X8664Arch::PHYS_OFFSET == 0xFFFF_8000_0000_0000);
+};
+
+#[cfg(test)]
+mod tests {
+    use super::{VirtualAddress, X8664Arch};
+    use crate::Arch;
+
+    #[test]
+    fn is_canonical() { // exercises virt_is_valid on both sides of the bit-47 boundary
+        fn yes(address: usize) {
+            assert!(X8664Arch::virt_is_valid(VirtualAddress::new(address)));
+        }
+        fn no(address: usize) {
+            assert!(!X8664Arch::virt_is_valid(VirtualAddress::new(address)));
+        }
+
+        yes(0xFFFF_8000_1337_1337);
+        yes(0xFFFF_FFFF_FFFF_FFFF);
+        yes(0x0000_0000_0000_0042);
+        yes(0x0000_7FFF_FFFF_FFFF);
+        no(0x1337_0000_0000_0000);
+        no(0x1337_8000_0000_0000);
+        no(0x0000_8000_0000_0000);
+    }
+}
diff --git a/rmm/src/arch/x86_shared.rs b/rmm/src/arch/x86_shared.rs
new file mode 100644
index 0000000000..a55a9fd9e3
--- /dev/null
+++ b/rmm/src/arch/x86_shared.rs
@@ -0,0 +1,37 @@
+#![expect(clippy::identity_op)]
+
+// Page attribute table is indexed by entry bits PAT(7) PCD(4) PWT(3); each constant is that bit pattern
+pub(crate) const _PAT_WB: usize = (0b0 << 7) + (0b00 << 3);
+pub(crate) const _PAT_WT: usize = (0b0 << 7) + (0b01 << 3);
+pub(crate) const PAT_UC_: usize = (0b0 << 7) + (0b10 << 3); // UC-
+pub(crate) const _PAT_UC: usize = (0b0 << 7) + (0b11 << 3); // UC
+pub(crate) const PAT_WC: usize = (0b1 << 7) + (0b00 << 3);
+
+/// Set up the page attribute table by writing all eight entries to the IA32_PAT MSR
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[inline(always)]
+pub unsafe fn init_pat() {
+    unsafe {
+        let uncacheable = 0; // UC
+        let write_combining = 1; // WC
+        let write_through = 4; // WT
+        let _write_protected = 5; // WP
+        let write_back = 6; // WB
+        let uncached = 7; // UC- (overridable by WC MTRR)
+
+        let pat0 = write_back; // index 0, selected by _PAT_WB
+        let pat1 = write_through; // index 1, selected by _PAT_WT
+        let pat2 = uncached; // index 2, selected by PAT_UC_
+        let pat3 = uncacheable; // index 3, selected by _PAT_UC
+
+        let pat4 = write_combining; // index 4, selected by PAT_WC
+        let pat5 = pat1;
+        let pat6 = pat2;
+        let pat7 = pat3;
+
+        let msr = 631; // IA32_PAT
+        let low = u32::from_be_bytes([pat3, pat2, pat1, pat0]); // big-endian array: pat0 ends up in the lowest byte
+        let high = u32::from_be_bytes([pat7, pat6, pat5, pat4]); // pat4 ends up in the lowest byte of the high half
+        core::arch::asm!("wrmsr", in("ecx") msr, in("eax") low, in("edx") high); // ECX = MSR index, EDX:EAX = value
+    }
+}
diff --git a/rmm/src/lib.rs b/rmm/src/lib.rs
new file mode 100644
index 0000000000..3088ec989e
--- /dev/null
+++ b/rmm/src/lib.rs
@@ -0,0 +1,97 @@
+#![no_std]
+#![allow(clippy::new_without_default)]
+
+pub use crate::{allocator::*, arch::*, page::*};
+
+mod allocator;
+mod arch;
+mod page;
+
+pub const KILOBYTE: usize = 1024;
+pub const MEGABYTE: usize = KILOBYTE * 1024;
+pub const GIGABYTE: usize = MEGABYTE * 1024;
+#[cfg(target_pointer_width = "64")]
+pub const TERABYTE: usize = GIGABYTE * 1024;
+
+/// Specific table to be used, needed on some architectures
+//TODO: Use this throughout the code
+#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
+pub enum TableKind {
+    /// Userspace page table
+    User,
+    /// Kernel page table
+    Kernel,
+}
+
+/// Physical memory address (a transparent wrapper around a `usize`)
+#[derive(Clone, Copy, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[repr(transparent)]
+pub struct PhysicalAddress(usize);
+
+impl PhysicalAddress {
+    #[inline(always)]
+    pub const fn new(address: usize) -> Self {
+        Self(address)
+    }
+
+    #[inline(always)]
+    pub fn data(&self) -> usize {
+        self.0
+    }
+
+    #[expect(clippy::should_implement_trait)]
+    #[inline(always)]
+    pub fn add(self, offset: usize) -> Self { // inherent method, not `core::ops::Add` (hence the lint expectation)
+        Self(self.0 + offset) // plain `+`: no explicit overflow handling here
+    }
+}
+
+impl core::fmt::Debug for PhysicalAddress {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "[phys {:#0x}]", self.data())
+    }
+}
+
+/// Virtual memory address (a transparent wrapper around a `usize`)
+#[derive(Clone, Copy, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[repr(transparent)]
+pub struct VirtualAddress(usize);
+
+impl VirtualAddress {
+    #[inline(always)]
+    pub const fn new(address: usize) -> Self {
+        Self(address)
+    }
+
+    #[inline(always)]
+    pub fn data(&self) -> usize {
+        self.0
+    }
+
+    #[expect(clippy::should_implement_trait)]
+    #[inline(always)]
+    pub fn add(self, offset: usize) -> Self { // inherent method, not `core::ops::Add` (hence the lint expectation)
+        Self(self.0 + offset) // plain `+`: no explicit overflow handling here
+    }
+
+    #[inline(always)]
+    pub fn kind(&self) -> TableKind { // classifies the address by its most significant bit
+        if (self.0 as isize) < 0 { // negative as a signed value, i.e. top bit set
+            TableKind::Kernel
+        } else {
+            TableKind::User
+        }
+    }
+}
+
+impl core::fmt::Debug for VirtualAddress {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "[virt {:#0x}]", self.data())
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct MemoryArea {
+    pub base: PhysicalAddress,
+    pub size: usize, // a byte count: main.rs divides it by PAGE_SIZE to iterate over pages
+}
diff --git a/rmm/src/main.rs b/rmm/src/main.rs
new file mode 100644
index 0000000000..e2419dc2fc
--- /dev/null
+++ b/rmm/src/main.rs
@@ -0,0 +1,309 @@
+#![cfg(target_pointer_width = "64")]
+
+use rmm::{
+    emulate::EmulateArch, Arch, BuddyAllocator, BumpAllocator, Flusher, FrameAllocator, FrameCount,
+    MemoryArea, PageFlags, PageFlushAll, PageMapper, PageTable, PhysicalAddress, TableKind,
+    VirtualAddress, GIGABYTE, KILOBYTE, MEGABYTE, TERABYTE,
+};
+use std::marker::PhantomData;
+
+pub fn format_size(size: usize) -> String { // largest unit that `size` reaches at least twice; integer division truncates
+    if size >= 2 * TERABYTE {
+        format!("{} TB", size / TERABYTE)
+    } else if size >= 2 * GIGABYTE {
+        format!("{} GB", size / GIGABYTE)
+    } else if size >= 2 * MEGABYTE {
+        format!("{} MB", size / MEGABYTE)
+    } else if size >= 2 * KILOBYTE {
+        format!("{} KB", size / KILOBYTE)
+    } else {
+        format!("{} B", size)
+    }
+}
+
+#[allow(dead_code)]
+unsafe fn dump_tables(table: PageTable) { // recursively prints every present level-0 entry reachable from `table`
+    unsafe {
+        let level = table.level();
+        for i in 0..A::PAGE_ENTRIES {
+            if level == 0 {
+                if let Some(entry) = table.entry(i) {
+                    if entry.present() {
+                        let base = table.entry_base(i).unwrap();
+                        println!(
+                            "0x{:X}: 0x{:X}",
+                            base.data(),
+                            entry.address().unwrap().data()
+                        );
+                    }
+                }
+            } else {
+                if let Some(next) = table.next(i) { // descend into the next-level table, if there is one
+                    dump_tables(next);
+                }
+            }
+        }
+    }
+}
+
+pub struct SlabNode { // free list whose links are stored inside the free blocks themselves
+    next: PhysicalAddress, // head of the list; only meaningful while `count > 0`
+    count: usize, // number of blocks currently on the list
+    phantom: PhantomData,
+}
+
+impl SlabNode {
+    pub fn new(next: PhysicalAddress, count: usize) -> Self {
+        Self {
+            next,
+            count,
+            phantom: PhantomData,
+        }
+    }
+
+    pub fn empty() -> Self {
+        Self::new(PhysicalAddress::new(0), 0)
+    }
+
+    pub unsafe fn insert(&mut self, phys: PhysicalAddress) { // pushes `phys` onto the front of the list
+        unsafe {
+            let virt = A::phys_to_virt(phys);
+            A::write(virt, self.next); // the old head is written into the block being inserted
+            self.next = phys;
+            self.count += 1;
+        }
+    }
+
+    pub unsafe fn remove(&mut self) -> Option { // pops the front block, or returns None when the list is empty
+        unsafe {
+            if self.count > 0 {
+                let phys = self.next;
+                let virt = A::phys_to_virt(phys);
+                self.next = A::read(virt); // the link stored by `insert` becomes the new head
+                self.count -= 1;
+                Some(phys)
+            } else {
+                None
+            }
+        }
+    }
+}
+
+pub struct SlabAllocator {
+    //TODO: Allow allocations up to maximum pageable size
+    nodes: [SlabNode; 4], // one free list per level, indexed by `level` in allocate/free/remaining
+    phantom: PhantomData,
+}
+
+impl SlabAllocator {
+    pub unsafe fn new(areas: &'static [MemoryArea], offset: usize) -> Self {
+        unsafe {
+            let mut allocator = Self {
+                nodes: [
+                    SlabNode::empty(),
+                    SlabNode::empty(),
+                    SlabNode::empty(),
+                    SlabNode::empty(),
+                ],
+                phantom: PhantomData,
+            };
+
+            // Add unused areas to free lists, skipping the first `offset` bytes counted across areas
+            let mut area_offset = offset;
+            for area in areas.iter() {
+                if area_offset < area.size {
+                    let area_base = area.base.add(area_offset);
+                    let area_size = area.size - area_offset;
+                    allocator.free(area_base, area_size);
+                    area_offset = 0; // the skip is used up; later areas are added whole
+                } else {
+                    area_offset -= area.size; // this area lies entirely inside the skipped prefix
+                }
+            }
+
+            allocator
+        }
+    }
+
+    pub unsafe fn allocate(&mut self, size: usize) -> Option {
+        unsafe {
+            for level in 0..A::PAGE_LEVELS - 1 { // smallest block size first
+                let level_shift = level * A::PAGE_ENTRY_SHIFT + A::PAGE_SHIFT;
+                let level_size = 1 << level_shift;
+                if size <= level_size {
+                    if let Some(base) = self.nodes[level].remove() {
+                        self.free(base.add(size), level_size - size); // hand the unused tail of the block back
+                        return Some(base);
+                    }
+                }
+            }
+            None
+        }
+    }
+
+    //TODO: This causes fragmentation, since neighbors are not identified
+    //TODO: remainders less than PAGE_SIZE will be lost
+    pub unsafe fn free(&mut self, mut base: PhysicalAddress, mut size: usize) {
+        unsafe {
+            for level in (0..A::PAGE_LEVELS - 1).rev() { // largest block size first
+                let level_shift = level * A::PAGE_ENTRY_SHIFT + A::PAGE_SHIFT;
+                let level_size = 1 << level_shift;
+                while size >= level_size {
+                    println!("Add {:X} {}", base.data(), format_size(level_size));
+                    self.nodes[level].insert(base);
+                    base = base.add(level_size);
+                    size -= level_size;
+                }
+            }
+        }
+    }
+
+    pub unsafe fn remaining(&mut self) -> usize { // total bytes currently held on all free lists
+        let mut remaining = 0;
+        for level in (0..A::PAGE_LEVELS - 1).rev() {
+            let level_shift = level * A::PAGE_ENTRY_SHIFT + A::PAGE_SHIFT;
+            let level_size = 1 << level_shift;
+            remaining += self.nodes[level].count * level_size;
+        }
+        remaining
+    }
+}
+
+unsafe fn new_tables(areas: &'static [MemoryArea]) { // builds a kernel table, then exercises the buddy allocator and mapper
+    unsafe {
+        // First, calculate how much memory we have
+        let mut size = 0;
+        for area in areas.iter() {
+            size += area.size;
+        }
+
+        println!("Memory: {}", format_size(size));
+
+        // Create a basic allocator for the first pages
+        let mut bump_allocator = BumpAllocator::::new(areas, 0);
+
+        {
+            // Map all physical areas at PHYS_OFFSET
+            let mut mapper = PageMapper::::create(TableKind::Kernel, &mut bump_allocator)
+                .expect("failed to create Mapper");
+            for area in areas.iter() {
+                for i in 0..area.size / A::PAGE_SIZE {
+                    let phys = area.base.add(i * A::PAGE_SIZE);
+                    let (_, flush) = mapper
+                        .map_linearly(phys, PageFlags::::new().write(true))
+                        .expect("failed to map page to frame");
+                    flush.ignore(); // Not the active table
+                }
+            }
+
+            // Use the new table
+            mapper.make_current();
+        }
+
+        // Create the physical memory map
+        let offset = bump_allocator.offset();
+        println!("Permanently used: {}", format_size(offset));
+
+        let mut allocator = BuddyAllocator::::new(bump_allocator).unwrap();
+
+        for i in 0..16 { // allocation smoke test: frees every third single frame and every second 16-frame run
+            {
+                let phys_opt = allocator.allocate_one();
+                println!("page {}: {:X?}", i, phys_opt);
+                if i % 3 == 0 {
+                    if let Some(phys) = phys_opt {
+                        println!("free {}: {:X?}", i, phys_opt);
+                        allocator.free_one(phys);
+                    }
+                }
+            }
+
+            {
+                let phys_opt = allocator.allocate(FrameCount::new(16));
+                println!("page*16 {}: {:X?}", i, phys_opt);
+                if i % 2 == 0 {
+                    if let Some(phys) = phys_opt {
+                        println!("free*16 {}: {:X?}", i, phys_opt);
+                        allocator.free(phys, FrameCount::new(16));
+                    }
+                }
+            }
+        }
+
+        let mut mapper = PageMapper::::current(TableKind::Kernel, &mut allocator);
+        let mut flush_all = PageFlushAll::new();
+        for i in 0..16 { // map 16 freshly allocated frames at consecutive pages starting at MEGABYTE
+            let virt = VirtualAddress::new(MEGABYTE + i * A::PAGE_SIZE);
+            let phys = mapper
+                .allocator_mut()
+                .allocate_one()
+                .expect("failed to map page");
+            let flush = mapper
+                .map_phys(virt, phys, PageFlags::::new().user(true).write(true))
+                .expect("failed to map page");
+            flush_all.consume(flush);
+        }
+        flush_all.flush();
+
+        let mut flush_all = PageFlushAll::new();
+        for i in 0..16 { // unmap the same 16 pages and return their frames to the allocator
+            let virt = VirtualAddress::new(MEGABYTE + i * A::PAGE_SIZE);
+            let (old, _, flush) = mapper.unmap_phys(virt).expect("failed to unmap page");
+            mapper.allocator_mut().free_one(old);
+            flush_all.consume(flush);
+        }
+        flush_all.flush();
+
+        let usage = allocator.usage();
+        println!("Allocator usage:");
+        println!(
+            " Used: {}",
+            format_size(usage.used().data() * A::PAGE_SIZE)
+        );
+        println!(
+            " Free: {}",
+            format_size(usage.free().data() * A::PAGE_SIZE)
+        );
+        println!(
+            " Total: {}",
+            format_size(usage.total().data() * A::PAGE_SIZE)
+        );
+    }
+}
+
+fn main() {
+    unsafe {
+        let areas = EmulateArch::init();
+
+        // Debug table
+        //dump_tables(PageTable::::top());
+
+        new_tables::(areas);
+
+        //dump_tables(PageTable::::top());
+
+        for i in &[1, 2, 4, 8, 16, 32] { // read, write 0x5A, read again at each of these megabyte offsets
+            let phys = PhysicalAddress::new(i * MEGABYTE);
+            let virt = EmulateArch::phys_to_virt(phys);
+
+            // Test read
+            println!(
+                "0x{:X} (0x{:X}) = 0x{:X}",
+                virt.data(),
+                phys.data(),
+                EmulateArch::read::(virt)
+            );
+
+            // Test write
+            EmulateArch::write::(virt, 0x5A);
+
+            // Test read
+            println!(
+                "0x{:X} (0x{:X}) = 0x{:X}",
+                virt.data(),
+                phys.data(),
+                EmulateArch::read::(virt)
+            );
+        }
+    }
+}
diff --git a/rmm/src/page/entry.rs b/rmm/src/page/entry.rs
new file mode 100644
index 0000000000..ac75f162b0
--- /dev/null
+++ b/rmm/src/page/entry.rs
@@ -0,0 +1,59 @@
+use core::marker::PhantomData;
+
+use crate::{Arch, PageFlags, PhysicalAddress};
+
+#[derive(Clone, Copy, Debug)]
+pub struct PageEntry {
+    data: usize, // raw entry word: address field and flag bits combined
+    phantom: PhantomData,
+}
+
+impl PageEntry {
+    #[inline(always)]
+    pub fn new(address: usize, flags: usize) -> Self { // `flags` is OR-ed in unmasked; the address is converted to a frame number first
+        let data = (((address >> A::PAGE_SHIFT) & A::ENTRY_ADDRESS_MASK) << A::ENTRY_ADDRESS_SHIFT)
+            | flags;
+        Self::from_data(data)
+    }
+
+    #[inline(always)]
+    pub fn from_data(data: usize) -> Self {
+        Self {
+            data,
+            phantom: PhantomData,
+        }
+    }
+
+    #[inline(always)]
+    pub fn data(&self) -> usize {
+        self.data
+    }
+
+    #[inline(always)]
+    pub fn address(&self) -> Result { // Ok when the entry is present; Err still carries the decoded address
+        let addr = PhysicalAddress(
+            ((self.data >> A::ENTRY_ADDRESS_SHIFT) & A::ENTRY_ADDRESS_MASK) << A::PAGE_SHIFT,
+        );
+
+        if self.present() {
+            Ok(addr)
+        } else {
+            Err(addr)
+        }
+    }
+
+    #[inline(always)]
+    pub fn flags(&self) -> PageFlags {
+        unsafe { PageFlags::from_data(self.data & A::ENTRY_FLAGS_MASK) }
+    }
+    #[inline(always)]
+    pub fn set_flags(&mut self, flags: PageFlags) { // replaces the flag bits and leaves the other bits untouched
+        self.data &= !A::ENTRY_FLAGS_MASK;
+        self.data |= flags.data();
+    }
+
+    #[inline(always)]
+    pub fn present(&self) -> bool {
+        self.data & A::ENTRY_FLAG_PRESENT != 0
+    }
+}
diff --git a/rmm/src/page/flags.rs b/rmm/src/page/flags.rs
new file mode 100644
index 0000000000..0d83d60d59
--- /dev/null
+++
b/rmm/src/page/flags.rs
@@ -0,0 +1,157 @@
+use core::{fmt, marker::PhantomData};
+
+use crate::Arch;
+
+#[derive(Clone, Copy)]
+pub struct PageFlags { // builder-style wrapper around the raw flag bits of a page-table entry
+    data: usize,
+    arch: PhantomData,
+}
+
+impl PageFlags {
+    #[inline(always)]
+    pub fn new() -> Self {
+        unsafe {
+            Self::from_data(
+                // Flags set to present, kernel space, read-only, no-execute by default
+                A::ENTRY_FLAG_DEFAULT_PAGE
+                    | A::ENTRY_FLAG_READONLY
+                    | A::ENTRY_FLAG_NO_EXEC
+                    | A::ENTRY_FLAG_NO_GLOBAL,
+            )
+        }
+    }
+
+    #[inline(always)]
+    pub fn new_table() -> Self {
+        unsafe {
+            Self::from_data(
+                // Architecture's default table-entry flags, plus no-execute and non-global
+                A::ENTRY_FLAG_DEFAULT_TABLE | A::ENTRY_FLAG_NO_EXEC | A::ENTRY_FLAG_NO_GLOBAL,
+            )
+        }
+    }
+
+    #[inline(always)]
+    pub unsafe fn from_data(data: usize) -> Self { // stores `data` verbatim; no validation is performed
+        Self {
+            data,
+            arch: PhantomData,
+        }
+    }
+
+    #[inline(always)]
+    pub fn data(&self) -> usize {
+        self.data
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn custom_flag(mut self, flag: usize, value: bool) -> Self { // sets (true) or clears (false) every bit of `flag`
+        if value {
+            self.data |= flag;
+        } else {
+            self.data &= !flag;
+        }
+        self
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn device_memory(self, value: bool) -> Self {
+        self.custom_flag(A::ENTRY_FLAG_DEVICE_MEMORY, value)
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn uncacheable(self, value: bool) -> Self {
+        self.custom_flag(A::ENTRY_FLAG_UNCACHEABLE, value)
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn write_combining(self, value: bool) -> Self {
+        self.custom_flag(A::ENTRY_FLAG_WRITE_COMBINING, value)
+    }
+
+    #[inline(always)]
+    pub fn has_flag(&self, flag: usize) -> bool {
+        self.data & flag == flag // every bit of `flag` must be set; a zero `flag` always matches
+    }
+
+    #[inline(always)]
+    pub fn has_present(&self) -> bool {
+        self.has_flag(A::ENTRY_FLAG_PRESENT)
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn user(self, value: bool) -> Self {
+        self.custom_flag(A::ENTRY_FLAG_PAGE_USER, value)
+    }
+
+    #[inline(always)]
+    pub fn has_user(&self) -> bool {
+        self.has_flag(A::ENTRY_FLAG_PAGE_USER)
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn write(self, value: bool) -> Self {
+        // Architecture may use readonly or readwrite, or both: clear both, then set the requested one
+        if value {
+            self.custom_flag(A::ENTRY_FLAG_READONLY | A::ENTRY_FLAG_READWRITE, false)
+                .custom_flag(A::ENTRY_FLAG_READWRITE, true)
+        } else {
+            self.custom_flag(A::ENTRY_FLAG_READONLY | A::ENTRY_FLAG_READWRITE, false)
+                .custom_flag(A::ENTRY_FLAG_READONLY, true)
+        }
+    }
+
+    #[inline(always)]
+    pub fn has_write(&self) -> bool {
+        // Architecture may use readonly or readwrite, or both: compare both fields against READWRITE
+        self.data & (A::ENTRY_FLAG_READONLY | A::ENTRY_FLAG_READWRITE) == A::ENTRY_FLAG_READWRITE
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn execute(self, value: bool) -> Self {
+        //TODO: write xor execute?
+        // Architecture may use no exec or exec, support either: the two flags get opposite values
+        self.custom_flag(A::ENTRY_FLAG_NO_EXEC, !value)
+            .custom_flag(A::ENTRY_FLAG_EXEC, value)
+    }
+
+    #[inline(always)]
+    pub fn has_execute(&self) -> bool {
+        // Architecture may use no exec or exec, support either: compare both fields against EXEC
+        self.data & (A::ENTRY_FLAG_NO_EXEC | A::ENTRY_FLAG_EXEC) == A::ENTRY_FLAG_EXEC
+    }
+
+    #[must_use]
+    #[inline(always)]
+    pub fn global(self, value: bool) -> Self {
+        // Architecture may use global or non global, support either: the two flags get opposite values
+        self.custom_flag(A::ENTRY_FLAG_NO_GLOBAL, !value)
+            .custom_flag(A::ENTRY_FLAG_GLOBAL, value)
+    }
+
+    #[inline(always)]
+    pub fn is_global(&self) -> bool {
+        // Architecture may use global or non global, support either: compare both fields against GLOBAL
+        self.data & (A::ENTRY_FLAG_GLOBAL | A::ENTRY_FLAG_NO_GLOBAL) == A::ENTRY_FLAG_GLOBAL
+    }
+}
+
+impl fmt::Debug for PageFlags {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // prints the decoded present/write/executable/user bits and the raw value in hex
+        f.debug_struct("PageFlags")
+            .field("present", &self.has_present())
+            .field("write", &self.has_write())
+            .field("executable", &self.has_execute())
+            .field("user", &self.has_user())
+            .field("bits", &format_args!("{:#0x}", self.data))
+            .finish()
+    }
+}
diff --git a/rmm/src/page/flush.rs b/rmm/src/page/flush.rs
new
file mode 100644 index 0000000000..0638fce807 --- /dev/null +++ b/rmm/src/page/flush.rs @@ -0,0 +1,71 @@ +use core::{marker::PhantomData, mem}; + +use crate::{Arch, VirtualAddress}; + +pub trait Flusher { + fn consume(&mut self, flush: PageFlush); +} + +#[must_use = "The page table must be flushed, or the changes unsafely ignored"] +pub struct PageFlush { + virt: VirtualAddress, + phantom: PhantomData, +} + +impl PageFlush { + pub fn new(virt: VirtualAddress) -> Self { + Self { + virt, + phantom: PhantomData, + } + } + + pub fn flush(self) { + A::invalidate(self.virt); + } + + #[expect(clippy::forget_non_drop)] + pub unsafe fn ignore(self) { + mem::forget(self); + } +} + +// TODO: Might remove Drop and add #[must_use] again, but ergonomically I prefer being able to pass +// a flusher, and have it dropped by the end of the function it is passed to, in order to flush. +pub struct PageFlushAll { + phantom: PhantomData A>, +} + +impl PageFlushAll { + pub fn new() -> Self { + Self { + phantom: PhantomData, + } + } + + pub fn flush(self) {} + + pub unsafe fn ignore(self) { + mem::forget(self); + } +} +impl Drop for PageFlushAll { + fn drop(&mut self) { + A::invalidate_all(); + } +} +impl Flusher for PageFlushAll { + fn consume(&mut self, flush: PageFlush) { + unsafe { + flush.ignore(); + } + } +} +impl + ?Sized> Flusher for &mut T { + fn consume(&mut self, flush: PageFlush) { + >::consume(self, flush) + } +} +impl Flusher for () { + fn consume(&mut self, _: PageFlush) {} +} diff --git a/rmm/src/page/mapper.rs b/rmm/src/page/mapper.rs new file mode 100644 index 0000000000..25d794b2d5 --- /dev/null +++ b/rmm/src/page/mapper.rs @@ -0,0 +1,269 @@ +use core::marker::PhantomData; + +use crate::{ + Arch, FrameAllocator, PageEntry, PageFlags, PageFlush, PageTable, PhysicalAddress, TableKind, + VirtualAddress, +}; + +pub struct PageMapper { + table_kind: TableKind, + table_addr: PhysicalAddress, + allocator: F, + _phantom: PhantomData A>, +} + +impl PageMapper { + unsafe fn 
new(table_kind: TableKind, table_addr: PhysicalAddress, allocator: F) -> Self { + Self { + table_kind, + table_addr, + allocator, + _phantom: PhantomData, + } + } + + pub unsafe fn current(table_kind: TableKind, allocator: F) -> Self { + unsafe { + let table_addr = A::table(table_kind); + Self::new(table_kind, table_addr, allocator) + } + } + + pub fn is_current(&self) -> bool { + self.table().phys() == A::table(self.table_kind) + } + + pub unsafe fn make_current(&self) { + unsafe { + A::set_table(self.table_kind, self.table_addr); + } + } + + pub fn table(&self) -> PageTable { + // SAFETY: The only way to initialize a PageMapper is via new(), and we assume it upholds + // all necessary invariants for this to be safe. + unsafe { PageTable::new(VirtualAddress::new(0), self.table_addr, A::PAGE_LEVELS - 1) } + } + + pub fn allocator(&self) -> &F { + &self.allocator + } + + pub fn allocator_mut(&mut self) -> &mut F { + &mut self.allocator + } + + fn visit( + &self, + virt: VirtualAddress, + f: impl FnOnce(&mut PageTable, usize) -> T, + ) -> Option { + let mut table = self.table(); + loop { + let i = table.index_of(virt)?; + if table.level() == 0 { + return Some(f(&mut table, i)); + } else { + table = unsafe { table.next(i)? }; + } + } + } + + pub fn translate(&self, virt: VirtualAddress) -> Option<(PhysicalAddress, PageFlags)> { + let entry = self.visit(virt, |p1, i| unsafe { p1.entry(i) })??; + Some((entry.address().ok()?, entry.flags())) + } + + pub unsafe fn remap_with_full( + &mut self, + virt: VirtualAddress, + f: impl FnOnce(PhysicalAddress, PageFlags) -> Option<(PhysicalAddress, PageFlags)>, + ) -> Option<(PageFlags, PhysicalAddress, PageFlush)> { + unsafe { + self.visit(virt, |p1, i| { + let old_entry = p1.entry(i)?; + let old_phys = old_entry.address().ok()?; + let old_flags = old_entry.flags(); + let (new_phys, new_flags) = f(old_phys, old_flags)?; + // TODO: Higher-level PageEntry::new interface? 
+ let new_entry = PageEntry::new(new_phys.data(), new_flags.data()); + p1.set_entry(i, new_entry); + Some((old_flags, old_phys, PageFlush::new(virt))) + }) + .flatten() + } + } + + pub unsafe fn remap_with( + &mut self, + virt: VirtualAddress, + map_flags: impl FnOnce(PageFlags) -> PageFlags, + ) -> Option<(PageFlags, PhysicalAddress, PageFlush)> { + unsafe { + self.remap_with_full(virt, |same_phys, old_flags| { + Some((same_phys, map_flags(old_flags))) + }) + } + } + + pub unsafe fn remap( + &mut self, + virt: VirtualAddress, + flags: PageFlags, + ) -> Option> { + unsafe { self.remap_with(virt, |_| flags).map(|(_, _, flush)| flush) } + } +} + +impl PageMapper { + pub unsafe fn create(table_kind: TableKind, mut allocator: F) -> Option { + unsafe { + let table_addr = allocator.allocate_one()?; + let mut table = Self::new(table_kind, table_addr, allocator); + + match (table_kind, A::KERNEL_SEPARATE_TABLE) { + (TableKind::Kernel, false) => { + // Pre-allocate all kernel top-level page table entries so that when + // the page table is copied, these entries are synced between processes. + for i in A::PAGE_ENTRIES / 2..A::PAGE_ENTRIES { + let phys = table + .allocator + .allocate_one() + .expect("failed to map page table"); + let flags = A::ENTRY_FLAG_DEFAULT_TABLE; + table + .table() + .set_entry(i, PageEntry::new(phys.data(), flags)); + } + } + (TableKind::User, false) => { + // Copy higher half (kernel) mappings + let active_ktable = PageMapper::current(TableKind::Kernel, ()); + for i in A::PAGE_ENTRIES / 2..A::PAGE_ENTRIES { + if let Some(entry) = active_ktable.table().entry(i) { + table.table().set_entry(i, entry); + } + } + } + (_, true) => { + // There is a separate page table for the kernel. No need to copy the kernel + // mappings to the user page table. 
+ } + } + + Some(table) + } + } + + pub unsafe fn map_phys( + &mut self, + virt: VirtualAddress, + phys: PhysicalAddress, + flags: PageFlags, + ) -> Option> { + unsafe { + //TODO: verify virt and phys are aligned + //TODO: verify flags have correct bits + let entry = PageEntry::new(phys.data(), flags.data()); + let mut table = self.table(); + loop { + let i = table.index_of(virt)?; + if table.level() == 0 { + //TODO: check for overwriting entry + table.set_entry(i, entry); + return Some(PageFlush::new(virt)); + } + + let next = match table.next(i) { + Some(some) => some, + None => { + let next_phys = self.allocator.allocate_one()?; + //TODO: correct flags? + let flags = A::ENTRY_FLAG_DEFAULT_TABLE + | if virt.kind() == TableKind::User { + A::ENTRY_FLAG_TABLE_USER + } else { + 0 + }; + table.set_entry(i, PageEntry::new(next_phys.data(), flags)); + table.next(i)? + } + }; + table = next; + } + } + } + + pub unsafe fn map_linearly( + &mut self, + phys: PhysicalAddress, + flags: PageFlags, + ) -> Option<(VirtualAddress, PageFlush)> { + unsafe { + let virt = A::phys_to_virt(phys); + self.map_phys(virt, phys, flags).map(|flush| (virt, flush)) + } + } + + pub unsafe fn unmap_phys( + &mut self, + virt: VirtualAddress, + ) -> Option<(PhysicalAddress, PageFlags, PageFlush)> { + //TODO: verify virt is aligned + let mut table = self.table(); + + let unmap_parents = A::KERNEL_SEPARATE_TABLE || table.index_of(virt)? 
< A::PAGE_ENTRIES / 2; // Is a userspace mapping + + unsafe { + unmap_phys_inner(virt, &mut table, unmap_parents, &mut self.allocator) + .map(|(pa, pf)| (pa, pf, PageFlush::new(virt))) + } + } +} + +unsafe fn unmap_phys_inner( + virt: VirtualAddress, + table: &mut PageTable, + unmap_parents: bool, + allocator: &mut impl FrameAllocator, +) -> Option<(PhysicalAddress, PageFlags)> { + unsafe { + let i = table.index_of(virt)?; + + if table.level() == 0 { + let entry_opt = table.entry(i); + table.set_entry(i, PageEntry::new(0, 0)); + let entry = entry_opt?; + + return Some((entry.address().ok()?, entry.flags())); + } + + let mut subtable = table.next(i)?; + + let res = unmap_phys_inner(virt, &mut subtable, unmap_parents, allocator)?; + + if unmap_parents { + // TODO: Use a counter? This would reduce the remaining number of available bits, but could be + // faster (benchmark is needed). + let is_still_populated = (0..A::PAGE_ENTRIES) + .map(|j| subtable.entry(j).expect("must be within bounds")) + .any(|e| e.present()); + + if !is_still_populated { + allocator.free_one(subtable.phys()); + table.set_entry(i, PageEntry::new(0, 0)); + } + } + + Some(res) + } +} + +impl core::fmt::Debug for PageMapper { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("PageMapper") + .field("frame", &self.table_addr) + .field("allocator", &self.allocator) + .finish() + } +} diff --git a/rmm/src/page/mod.rs b/rmm/src/page/mod.rs new file mode 100644 index 0000000000..c4c4f00241 --- /dev/null +++ b/rmm/src/page/mod.rs @@ -0,0 +1,7 @@ +pub use self::{entry::*, flags::*, flush::*, mapper::*, table::*}; + +mod entry; +mod flags; +mod flush; +mod mapper; +mod table; diff --git a/rmm/src/page/table.rs b/rmm/src/page/table.rs new file mode 100644 index 0000000000..48796d81ff --- /dev/null +++ b/rmm/src/page/table.rs @@ -0,0 +1,105 @@ +use core::{fmt, marker::PhantomData}; + +use crate::{page::PageEntry, Arch, PhysicalAddress, VirtualAddress}; + +pub struct 
PageTable { + base: VirtualAddress, + phys: PhysicalAddress, + level: usize, + phantom: PhantomData, +} + +impl PageTable { + pub(super) unsafe fn new(base: VirtualAddress, phys: PhysicalAddress, level: usize) -> Self { + Self { + base, + phys, + level, + phantom: PhantomData, + } + } + + pub fn base(&self) -> VirtualAddress { + self.base + } + + pub fn phys(&self) -> PhysicalAddress { + self.phys + } + + pub fn level(&self) -> usize { + self.level + } + + pub fn entry_base(&self, i: usize) -> Option { + if i < A::PAGE_ENTRIES { + let level_shift = self.level * A::PAGE_ENTRY_SHIFT + A::PAGE_SHIFT; + Some(self.base.add(i << level_shift)) + } else { + None + } + } + + unsafe fn entry_virt(&self, i: usize) -> Option { + if i < A::PAGE_ENTRIES { + Some(A::phys_to_virt(self.phys).add(i * A::PAGE_ENTRY_SIZE)) + } else { + None + } + } + + pub unsafe fn entry(&self, i: usize) -> Option> { + unsafe { + let addr = self.entry_virt(i)?; + Some(PageEntry::from_data(A::read::(addr))) + } + } + + pub(super) unsafe fn set_entry(&mut self, i: usize, entry: PageEntry) -> Option<()> { + unsafe { + let addr = self.entry_virt(i)?; + A::write::(addr, entry.data()); + Some(()) + } + } + + pub(super) fn index_of(&self, address: VirtualAddress) -> Option { + // Canonicalize address first + let address = VirtualAddress::new(address.data() & A::PAGE_ADDRESS_MASK); + let level_shift = self.level * A::PAGE_ENTRY_SHIFT + A::PAGE_SHIFT; + // Intentionally wraps around at last-level table to get all-ones mask on architectures + // where addressable physical address space covers entire usized space (e.g. 
x86) + let level_mask = A::PAGE_ENTRIES + .wrapping_shl(level_shift as u32) + .wrapping_sub(1); + if address >= self.base && address <= self.base.add(level_mask) { + Some((address.data() >> level_shift) & A::PAGE_ENTRY_MASK) + } else { + None + } + } + + pub unsafe fn next(&self, i: usize) -> Option { + if self.level == 0 { + return None; + } + + unsafe { + Some(PageTable::new( + self.entry_base(i)?, + self.entry(i)?.address().ok()?, + self.level - 1, + )) + } + } + + pub fn debug_entries(&self, f: impl Fn(fmt::Arguments<'_>)) { + for i in 0..A::PAGE_ENTRIES { + if let Some(entry) = unsafe { self.entry(i) } + && entry.present() + { + f(format_args!("{}: {:X}", i, entry.data())); + } + } + } +} diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000000..42f22f6190 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "nightly-2025-10-03" +components = ["rust-src"] diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000000..5dc46d08ed --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,22 @@ +blank_lines_lower_bound = 0 # default +blank_lines_upper_bound = 1 # default +brace_style = "SameLineWhere" # default +disable_all_formatting = false # default +edition = "2024" +style_edition = "2015" +empty_item_single_line = true # default +fn_single_line = false # default +force_explicit_abi = true # default +format_strings = false # default +hard_tabs = false # default +show_parse_errors = true # default +imports_granularity = "Crate" # default = Preserve +imports_indent = "Block" # default +imports_layout = "Mixed" # default +indent_style = "Block" # default +max_width = 100 # default +newline_style = "Unix" # default = Auto +skip_children = false # default +tab_spaces = 4 # default +trailing_comma = "Vertical" # default +where_single_line = false # default diff --git a/src/acpi/gtdt.rs b/src/acpi/gtdt.rs new file mode 100644 index 0000000000..1c5743c299 --- /dev/null +++ b/src/acpi/gtdt.rs @@ -0,0 
+1,64 @@ +use alloc::boxed::Box; + +use super::{find_sdt, sdt::Sdt}; +use crate::{ + arch::device::generic_timer::GenericTimer, + dtb::irqchip::{register_irq, IRQ_CHIP}, +}; + +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct Gtdt { + pub header: Sdt, + pub cnt_control_base: u64, + _reserved: u32, + pub secure_el1_timer_gsiv: u32, + pub secure_el1_timer_flags: u32, + pub non_secure_el1_timer_gsiv: u32, + pub non_secure_el1_timer_flags: u32, + pub virtual_el1_timer_gsiv: u32, + pub virtual_el1_timer_flags: u32, + pub el2_timer_gsiv: u32, + pub el2_timer_flags: u32, + pub cnt_read_base: u64, + pub platform_timer_count: u32, + pub platform_timer_offset: u32, + /*TODO: we don't need these yet, and they cause short tables to fail parsing + pub virtual_el2_timer_gsiv: u32, + pub virtual_el2_timer_flags: u32, + */ + //TODO: platform timer structure (at platform timer offset, with platform timer count) +} + +impl Gtdt { + pub fn init() { + let gtdt_sdt = find_sdt("GTDT"); + let gtdt = if gtdt_sdt.len() == 1 { + match Gtdt::new(gtdt_sdt[0]) { + Some(gtdt) => gtdt, + None => { + warn!("Failed to parse GTDT"); + return; + } + } + } else { + warn!("Unable to find GTDT"); + return; + }; + + let gsiv = gtdt.non_secure_el1_timer_gsiv; + info!("generic_timer gsiv = {}", gsiv); + let mut timer = GenericTimer::new(); + timer.init(); + register_irq(gsiv, Box::new(timer)); + unsafe { IRQ_CHIP.irq_enable(gsiv as u32) }; + } + + pub fn new(sdt: &'static Sdt) -> Option<&'static Gtdt> { + if &sdt.signature == b"GTDT" && sdt.length as usize >= size_of::() { + Some(unsafe { &*((sdt as *const Sdt) as *const Gtdt) }) + } else { + None + } + } +} diff --git a/src/acpi/hpet.rs b/src/acpi/hpet.rs new file mode 100644 index 0000000000..ad62f14a95 --- /dev/null +++ b/src/acpi/hpet.rs @@ -0,0 +1,121 @@ +use core::ptr::{self, read_volatile, write_volatile}; + +#[cfg(not(target_arch = "x86"))] +use crate::memory::{RmmA, RmmArch}; +use crate::{find_one_sdt, memory::PhysicalAddress}; + 
+use super::{sdt::Sdt, GenericAddressStructure, ACPI_TABLE}; + +#[repr(C, packed)] +#[derive(Clone, Copy, Debug)] +pub struct Hpet { + pub header: Sdt, + + pub hw_rev_id: u8, + pub comparator_descriptor: u8, + pub pci_vendor_id: u16, + + pub base_address: GenericAddressStructure, + + pub hpet_number: u8, + pub min_periodic_clk_tick: u16, + pub oem_attribute: u8, +} + +impl Hpet { + pub fn init() { + let hpet = Hpet::new(find_one_sdt!("HPET")); + + if let Some(hpet) = hpet { + debug!(" HPET: {:X}", hpet.hpet_number); + + let mut hpet_t = ACPI_TABLE.hpet.write(); + *hpet_t = Some(hpet); + } + } + + pub fn new(sdt: &'static Sdt) -> Option { + if &sdt.signature == b"HPET" && sdt.length as usize >= size_of::() { + let s = unsafe { ptr::read((sdt as *const Sdt) as *const Hpet) }; + if s.base_address.address_space == 0 { + unsafe { s.map() }; + Some(s) + } else { + warn!( + "HPET has unsupported address space {}", + s.base_address.address_space + ); + None + } + } else { + None + } + } +} + +//TODO: x86 use assumes only one HPET and only one GenericAddressStructure +#[cfg(target_arch = "x86")] +impl Hpet { + pub unsafe fn map(&self) { + unsafe { + use crate::memory::{Frame, KernelMapper, Page, PageFlags, VirtualAddress}; + + let frame = Frame::containing(PhysicalAddress::new(self.base_address.address as usize)); + let page = Page::containing_address(VirtualAddress::new(crate::HPET_OFFSET)); + + KernelMapper::lock_rw() + .map_phys( + page.start_address(), + frame.base(), + PageFlags::new().write(true).device_memory(true), + ) + .expect("failed to map memory for GenericAddressStructure") + .flush(); + } + } + + pub unsafe fn read_u64(&self, offset: usize) -> u64 { + unsafe { read_volatile((crate::HPET_OFFSET + offset) as *const u64) } + } + + pub unsafe fn write_u64(&mut self, offset: usize, value: u64) { + unsafe { + write_volatile((crate::HPET_OFFSET + offset) as *mut u64, value); + } + } +} + +#[cfg(not(target_arch = "x86"))] +impl Hpet { + pub unsafe fn map(&self) { + 
unsafe { + crate::memory::map_device_memory( + PhysicalAddress::new(self.base_address.address as usize), + crate::memory::PAGE_SIZE, + ); + } + } + + pub unsafe fn read_u64(&self, offset: usize) -> u64 { + unsafe { + read_volatile( + RmmA::phys_to_virt(PhysicalAddress::new( + self.base_address.address as usize + offset, + )) + .data() as *const u64, + ) + } + } + + pub unsafe fn write_u64(&mut self, offset: usize, value: u64) { + unsafe { + write_volatile( + RmmA::phys_to_virt(PhysicalAddress::new( + self.base_address.address as usize + offset, + )) + .data() as *mut u64, + value, + ); + } + } +} diff --git a/src/acpi/madt/arch/aarch64.rs b/src/acpi/madt/arch/aarch64.rs new file mode 100644 index 0000000000..2fa4968327 --- /dev/null +++ b/src/acpi/madt/arch/aarch64.rs @@ -0,0 +1,97 @@ +use alloc::{boxed::Box, vec::Vec}; + +use super::{Madt, MadtEntry}; +use crate::{ + arch::device::irqchip::{ + gic::{GenericInterruptController, GicCpuIf, GicDistIf}, + gicv3::{GicV3, GicV3CpuIf}, + }, + dtb::irqchip::{IrqChipItem, IRQ_CHIP}, + memory::{map_device_memory, PhysicalAddress, PAGE_SIZE}, +}; + +pub(super) fn init(madt: Madt) { + let mut gicd_opt = None; + let mut giccs = Vec::new(); + for madt_entry in madt.iter() { + debug!(" {:#x?}", madt_entry); + match madt_entry { + MadtEntry::Gicc(gicc) => { + giccs.push(gicc); + } + MadtEntry::Gicd(gicd) => { + if gicd_opt.is_some() { + warn!("Only one GICD should be present on a system, ignoring this one"); + } else { + gicd_opt = Some(gicd); + } + } + _ => {} + } + } + let Some(gicd) = gicd_opt else { + warn!("No GICD found"); + return; + }; + let mut gic_dist_if = GicDistIf::default(); + unsafe { + let phys = PhysicalAddress::new(gicd.physical_base_address as usize); + let virt = map_device_memory(phys, PAGE_SIZE); + gic_dist_if.init(virt.data()); + }; + info!("{:#x?}", gic_dist_if); + match gicd.gic_version { + 1 | 2 => { + for gicc in giccs { + let mut gic_cpu_if = GicCpuIf::default(); + unsafe { + let phys = 
PhysicalAddress::new(gicc.physical_base_address as usize); + let virt = map_device_memory(phys, PAGE_SIZE); + gic_cpu_if.init(virt.data()) + }; + info!("{:#x?}", gic_cpu_if); + let gic = GenericInterruptController { + gic_dist_if, + gic_cpu_if, + irq_range: (0, 0), + }; + let chip = IrqChipItem { + phandle: 0, + parents: Vec::new(), + children: Vec::new(), + ic: Box::new(gic), + }; + unsafe { IRQ_CHIP.irq_chip_list.chips.push(chip) }; + //TODO: support more GICCs + break; + } + } + 3 => { + for gicc in giccs { + let mut gic_cpu_if = GicV3CpuIf; + unsafe { gic_cpu_if.init() }; + info!("{:#x?}", gic_cpu_if); + let gic = GicV3 { + gic_dist_if, + gic_cpu_if, + //TODO: get GICRs + gicrs: Vec::new(), + irq_range: (0, 0), + }; + let chip = IrqChipItem { + phandle: 0, + parents: Vec::new(), + children: Vec::new(), + ic: Box::new(gic), + }; + unsafe { IRQ_CHIP.irq_chip_list.chips.push(chip) }; + //TODO: support more GICCs + break; + } + } + _ => { + warn!("unsupported GIC version {}", gicd.gic_version); + } + } + unsafe { IRQ_CHIP.init(None) }; +} diff --git a/src/acpi/madt/arch/other.rs b/src/acpi/madt/arch/other.rs new file mode 100644 index 0000000000..6400e09afb --- /dev/null +++ b/src/acpi/madt/arch/other.rs @@ -0,0 +1,9 @@ +use super::Madt; + +pub(super) fn init(madt: Madt) { + for madt_entry in madt.iter() { + debug!(" {:#x?}", madt_entry); + } + + warn!("MADT not yet handled on this platform"); +} diff --git a/src/acpi/madt/arch/x86.rs b/src/acpi/madt/arch/x86.rs new file mode 100644 index 0000000000..4dc2388398 --- /dev/null +++ b/src/acpi/madt/arch/x86.rs @@ -0,0 +1,160 @@ +use core::{ + hint, + sync::atomic::{AtomicU8, Ordering}, +}; + +use crate::{ + arch::{ + device::local_apic::the_local_apic, + start::{kstart_ap, KernelArgsAp}, + }, + cpu_set::LogicalCpuId, + memory::{ + allocate_p2frame, Frame, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, RmmArch, + VirtualAddress, PAGE_SIZE, + }, + startup::AP_READY, +}; + +use super::{Madt, MadtEntry}; + +const 
TRAMPOLINE: usize = 0x8000; +static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline")); + +pub(super) fn init(madt: Madt) { + let local_apic = unsafe { the_local_apic() }; + let me = local_apic.id(); + + if local_apic.x2 { + debug!(" X2APIC {}", me.get()); + } else { + debug!(" XAPIC {}: {:>08X}", me.get(), local_apic.address); + } + + if cfg!(not(feature = "multi_core")) { + return; + } + + // Map trampoline + let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE)); + let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE)); + let (result, page_table_physaddr) = unsafe { + //TODO: do not have writable and executable! + let mut mapper = KernelMapper::lock_rw(); + + let result = mapper + .map_phys( + trampoline_page.start_address(), + trampoline_frame.base(), + PageFlags::new().execute(true).write(true), + ) + .expect("failed to map trampoline"); + + (result, mapper.table().phys().data()) + }; + result.flush(); + + // Write trampoline, make sure TRAMPOLINE page is free for use + for (i, val) in TRAMPOLINE_DATA.iter().enumerate() { + unsafe { + (*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst); + } + } + + unsafe { + let preliminary_cpu_count = madt.iter().filter(|e| matches!(e, MadtEntry::LocalApic(entry) if u32::from(entry.id) == me.get() || entry.flags & 1 == 1)).count(); + crate::profiling::allocate(preliminary_cpu_count as u32); + } + + for madt_entry in madt.iter() { + debug!(" {:x?}", madt_entry); + if let MadtEntry::LocalApic(ap_local_apic) = madt_entry { + if u32::from(ap_local_apic.id) == me.get() { + debug!(" This is my local APIC"); + } else if ap_local_apic.flags & 1 == 1 { + let cpu_id = LogicalCpuId::next(); + + // Allocate a stack + let stack_start = RmmA::phys_to_virt( + allocate_p2frame(4) + .expect("no more frames in acpi stack_start") + .base(), + ) + .data(); + let stack_end = stack_start + (PAGE_SIZE << 4); + + let pcr_ptr = 
crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end); + + let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id); + + let args = KernelArgsAp { + stack_end: stack_end as *mut u8, + cpu_id, + pcr_ptr, + idt_ptr, + }; + + let ap_ready = (TRAMPOLINE + 8) as *mut u64; + let ap_args_ptr = unsafe { ap_ready.add(1) }; + let ap_page_table = unsafe { ap_ready.add(2) }; + let ap_code = unsafe { ap_ready.add(3) }; + + // Set the ap_ready to 0, volatile + unsafe { + ap_ready.write(0); + ap_args_ptr.write(&args as *const _ as u64); + ap_page_table.write(page_table_physaddr as u64); + #[expect(clippy::fn_to_numeric_cast)] + ap_code.write(kstart_ap as u64); + + // TODO: Is this necessary (this fence)? + core::arch::asm!(""); + }; + AP_READY.store(false, Ordering::SeqCst); + + // Send INIT IPI + { + let mut icr = 0x4500; + if local_apic.x2 { + icr |= u64::from(ap_local_apic.id) << 32; + } else { + icr |= u64::from(ap_local_apic.id) << 56; + } + local_apic.set_icr(icr); + } + + // Send START IPI + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + let mut icr = 0x4600 | ap_segment as u64; + + if local_apic.x2 { + icr |= u64::from(ap_local_apic.id) << 32; + } else { + icr |= u64::from(ap_local_apic.id) << 56; + } + + local_apic.set_icr(icr); + } + + // Wait for trampoline ready + while unsafe { (*ap_ready.cast::()).load(Ordering::SeqCst) } == 0 { + hint::spin_loop(); + } + while !AP_READY.load(Ordering::SeqCst) { + hint::spin_loop(); + } + + RmmA::invalidate_all(); + } + } + } + + // Unmap trampoline + let (_frame, _, flush) = unsafe { + KernelMapper::lock_rw() + .unmap_phys(trampoline_page.start_address()) + .expect("failed to unmap trampoline page") + }; + flush.flush(); +} diff --git a/src/acpi/madt/mod.rs b/src/acpi/madt/mod.rs new file mode 100644 index 0000000000..3159b9c497 --- /dev/null +++ b/src/acpi/madt/mod.rs @@ -0,0 +1,240 @@ +use core::cell::SyncUnsafeCell; + +use super::sdt::Sdt; +use crate::find_one_sdt; + +/// The Multiple APIC Descriptor Table 
+#[derive(Clone, Copy, Debug)] +pub struct Madt { + sdt: &'static Sdt, + pub local_address: u32, + pub flags: u32, +} + +#[cfg(target_arch = "aarch64")] +#[path = "arch/aarch64.rs"] +mod arch; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[path = "arch/x86.rs"] +mod arch; + +#[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] +#[path = "arch/other.rs"] +mod arch; + +static MADT: SyncUnsafeCell> = SyncUnsafeCell::new(None); +pub fn madt() -> Option<&'static Madt> { + unsafe { &*MADT.get() }.as_ref() +} +pub const FLAG_PCAT: u32 = 1; + +impl Madt { + pub fn init() { + let madt = Madt::new(find_one_sdt!("APIC")); + + if let Some(madt) = madt { + // safe because no APs have been started yet. + unsafe { MADT.get().write(Some(madt)) }; + + debug!(" APIC: {:>08X}: {}", madt.local_address, madt.flags); + + arch::init(madt); + } + } + + pub fn new(sdt: &'static Sdt) -> Option { + if &sdt.signature == b"APIC" && sdt.data_len() >= 8 { + //Not valid if no local address and flags + let local_address = unsafe { (sdt.data_address() as *const u32).read_unaligned() }; + let flags = unsafe { + (sdt.data_address() as *const u32) + .offset(1) + .read_unaligned() + }; + + Some(Madt { + sdt, + local_address, + flags, + }) + } else { + None + } + } + + pub fn iter(&self) -> MadtIter { + MadtIter { + sdt: self.sdt, + i: 8, // Skip local controller address and flags + } + } +} + +/// MADT Local APIC +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtLocalApic { + /// Processor ID + pub processor: u8, + /// Local APIC ID + pub id: u8, + /// Flags. 
1 means that the processor is enabled + pub flags: u32, +} + +/// MADT I/O APIC +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtIoApic { + /// I/O APIC ID + pub id: u8, + /// reserved + _reserved: u8, + /// I/O APIC address + pub address: u32, + /// Global system interrupt base + pub gsi_base: u32, +} + +/// MADT Interrupt Source Override +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtIntSrcOverride { + /// Bus Source + pub bus_source: u8, + /// IRQ Source + pub irq_source: u8, + /// Global system interrupt base + pub gsi_base: u32, + /// Flags + pub flags: u16, +} + +/// MADT GICC +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtGicc { + _reserved: u16, + pub cpu_interface_number: u32, + pub acpi_processor_uid: u32, + pub flags: u32, + pub parking_protocol_version: u32, + pub performance_interrupt_gsiv: u32, + pub parked_address: u64, + pub physical_base_address: u64, + pub gicv: u64, + pub gich: u64, + pub vgic_maintenance_interrupt: u32, + pub gicr_base_address: u64, + pub mpidr: u64, + pub processor_power_efficiency_class: u8, + _reserved2: u8, + pub spe_overflow_interrupt: u16, + //TODO: optional field introduced in ACPI 6.5: pub trbe_interrupt: u16, +} + +/// MADT GICD +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct MadtGicd { + _reserved: u16, + pub gic_id: u32, + pub physical_base_address: u64, + pub system_vector_base: u32, + pub gic_version: u8, + _reserved2: [u8; 3], +} + +/// MADT Entries +#[derive(Debug)] +#[allow(dead_code)] +pub enum MadtEntry { + LocalApic(&'static MadtLocalApic), + InvalidLocalApic(usize), + IoApic(&'static MadtIoApic), + InvalidIoApic(usize), + IntSrcOverride(&'static MadtIntSrcOverride), + InvalidIntSrcOverride(usize), + Gicc(&'static MadtGicc), + InvalidGicc(usize), + Gicd(&'static MadtGicd), + InvalidGicd(usize), + Unknown(u8), +} + +pub struct MadtIter { + sdt: &'static Sdt, + i: usize, +} + +impl Iterator for MadtIter { + type Item = MadtEntry; + fn 
next(&mut self) -> Option { + if self.i + 1 < self.sdt.data_len() { + let entry_type = unsafe { *(self.sdt.data_address() as *const u8).add(self.i) }; + let entry_len = + unsafe { *(self.sdt.data_address() as *const u8).add(self.i + 1) } as usize; + + if self.i + entry_len <= self.sdt.data_len() { + let item = match entry_type { + 0x0 => { + if entry_len == size_of::() + 2 { + MadtEntry::LocalApic(unsafe { + &*((self.sdt.data_address() + self.i + 2) as *const MadtLocalApic) + }) + } else { + MadtEntry::InvalidLocalApic(entry_len) + } + } + 0x1 => { + if entry_len == size_of::() + 2 { + MadtEntry::IoApic(unsafe { + &*((self.sdt.data_address() + self.i + 2) as *const MadtIoApic) + }) + } else { + MadtEntry::InvalidIoApic(entry_len) + } + } + 0x2 => { + if entry_len == size_of::() + 2 { + MadtEntry::IntSrcOverride(unsafe { + &*((self.sdt.data_address() + self.i + 2) + as *const MadtIntSrcOverride) + }) + } else { + MadtEntry::InvalidIntSrcOverride(entry_len) + } + } + 0xB => { + if entry_len >= size_of::() + 2 { + MadtEntry::Gicc(unsafe { + &*((self.sdt.data_address() + self.i + 2) as *const MadtGicc) + }) + } else { + MadtEntry::InvalidGicc(entry_len) + } + } + 0xC => { + if entry_len >= size_of::() + 2 { + MadtEntry::Gicd(unsafe { + &*((self.sdt.data_address() + self.i + 2) as *const MadtGicd) + }) + } else { + MadtEntry::InvalidGicd(entry_len) + } + } + _ => MadtEntry::Unknown(entry_type), + }; + + self.i += entry_len; + + Some(item) + } else { + None + } + } else { + None + } + } +} diff --git a/src/acpi/mod.rs b/src/acpi/mod.rs new file mode 100644 index 0000000000..59e3526544 --- /dev/null +++ b/src/acpi/mod.rs @@ -0,0 +1,212 @@ +//! # ACPI +//! 
Code to parse the ACPI tables + +use alloc::{boxed::Box, string::String, vec::Vec}; + +use hashbrown::HashMap; +use spin::{Once, RwLock}; + +use crate::memory::{KernelMapper, PageFlags, PhysicalAddress, RmmA, RmmArch}; + +use self::{hpet::Hpet, madt::Madt, rsdp::Rsdp, rsdt::Rsdt, rxsdt::Rxsdt, sdt::Sdt, xsdt::Xsdt}; + +#[cfg(target_arch = "aarch64")] +mod gtdt; +pub mod hpet; +pub mod madt; +mod rsdp; +mod rsdt; +mod rxsdt; +pub mod sdt; +#[cfg(target_arch = "aarch64")] +mod spcr; +mod xsdt; + +unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) { + unsafe { + let base = PhysicalAddress::new(crate::memory::round_down_pages(addr.data())); + let aligned_len = crate::memory::round_up_pages(len + (addr.data() - base.data())); + + for page_idx in 0..aligned_len / crate::memory::PAGE_SIZE { + let (_, flush) = mapper + .map_linearly( + base.add(page_idx * crate::memory::PAGE_SIZE), + PageFlags::new(), + ) + .expect("failed to linearly map SDT"); + flush.flush(); + } + } +} + +pub fn get_sdt(sdt_address: PhysicalAddress, mapper: &mut KernelMapper) -> &'static Sdt { + let sdt; + + unsafe { + const SDT_SIZE: usize = size_of::(); + map_linearly(sdt_address, SDT_SIZE, mapper); + + sdt = &*(RmmA::phys_to_virt(sdt_address).data() as *const Sdt); + + map_linearly( + sdt_address.add(SDT_SIZE), + sdt.length as usize - SDT_SIZE, + mapper, + ); + } + sdt +} + +#[repr(C, packed)] +#[derive(Clone, Copy, Debug, Default)] +pub struct GenericAddressStructure { + pub address_space: u8, + pub bit_width: u8, + pub bit_offset: u8, + pub access_size: u8, + pub address: u64, +} + +pub enum RxsdtEnum { + Rsdt(Rsdt), + Xsdt(Xsdt), +} +impl Rxsdt for RxsdtEnum { + fn iter(&self) -> Box> { + match self { + Self::Rsdt(rsdt) => ::iter(rsdt), + Self::Xsdt(xsdt) => ::iter(xsdt), + } + } +} + +pub static RXSDT_ENUM: Once = Once::new(); + +/// Parse the ACPI tables to gather CPU, interrupt, and timer information +pub unsafe fn init(already_supplied_rsdp: 
Option<*const u8>) { + unsafe { + { + let mut sdt_ptrs = SDT_POINTERS.write(); + *sdt_ptrs = Some(HashMap::new()); + } + + // Search for RSDP + let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp); + + if let Some(rsdp) = rsdp_opt { + debug!("SDT address: {:#x}", rsdp.sdt_address().data()); + let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw()); + + let rxsdt = if let Some(rsdt) = Rsdt::new(rxsdt) { + let mut initialized = false; + + let rsdt = RXSDT_ENUM.call_once(|| { + initialized = true; + + RxsdtEnum::Rsdt(rsdt) + }); + + if !initialized { + error!("RXSDT_ENUM already initialized"); + } + + rsdt + } else if let Some(xsdt) = Xsdt::new(rxsdt) { + let mut initialized = false; + + let xsdt = RXSDT_ENUM.call_once(|| { + initialized = true; + + RxsdtEnum::Xsdt(xsdt) + }); + if !initialized { + error!("RXSDT_ENUM already initialized"); + } + + xsdt + } else { + warn!("UNKNOWN RSDT OR XSDT SIGNATURE"); + return; + }; + + // TODO: Don't touch ACPI tables in kernel? + + for sdt in rxsdt.iter() { + get_sdt(sdt, &mut KernelMapper::lock_rw()); + } + + for sdt_address in rxsdt.iter() { + let sdt = &*(RmmA::phys_to_virt(sdt_address).data() as *const Sdt); + + let signature = get_sdt_signature(sdt); + if let Some(ref mut ptrs) = *(SDT_POINTERS.write()) { + ptrs.insert(signature, sdt); + } + } + + // TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface + // to initialize enumerated processors to userspace? + Madt::init(); + //TODO: support this on any arch + // SPCR must be initialized after MADT for interrupt controllers + #[cfg(target_arch = "aarch64")] + spcr::Spcr::init(); + // TODO: Let userspace setup HPET, and then provide an interface to specify which timer to + // use? 
+ Hpet::init(); + #[cfg(target_arch = "aarch64")] + gtdt::Gtdt::init(); + } else { + error!("NO RSDP FOUND"); + } + } +} + +pub type SdtSignature = (String, [u8; 6], [u8; 8]); +pub static SDT_POINTERS: RwLock>> = RwLock::new(None); + +pub fn find_sdt(name: &str) -> Vec<&'static Sdt> { + let mut sdts: Vec<&'static Sdt> = vec![]; + + if let Some(ref ptrs) = *(SDT_POINTERS.read()) { + for (signature, sdt) in ptrs { + if signature.0 == name { + sdts.push(sdt); + } + } + } + + sdts +} + +#[macro_export] +macro_rules! find_one_sdt { + ($name:expr) => {{ + use $crate::acpi::find_sdt; + match find_sdt($name).as_slice() { + [] => { + println!("Unable to find {}", $name); + return; + } + [x] => *x, + x => { + println!("{} {} found, expected 1", x.len(), $name); + return; + } + } + }}; +} + +pub fn get_sdt_signature(sdt: &'static Sdt) -> SdtSignature { + let signature = + String::from_utf8(sdt.signature.to_vec()).expect("Error converting signature to string"); + (signature, sdt.oem_id, sdt.oem_table_id) +} + +pub struct Acpi { + pub hpet: RwLock>, +} + +pub static ACPI_TABLE: Acpi = Acpi { + hpet: RwLock::new(None), +}; diff --git a/src/acpi/rsdp.rs b/src/acpi/rsdp.rs new file mode 100644 index 0000000000..f10c5ac989 --- /dev/null +++ b/src/acpi/rsdp.rs @@ -0,0 +1,34 @@ +use rmm::PhysicalAddress; + +/// RSDP +#[derive(Copy, Clone, Debug)] +#[repr(C, packed)] +pub struct Rsdp { + signature: [u8; 8], + _checksum: u8, + _oemid: [u8; 6], + revision: u8, + rsdt_address: u32, + _length: u32, + xsdt_address: u64, + _extended_checksum: u8, + _reserved: [u8; 3], +} + +impl Rsdp { + pub unsafe fn get_rsdp(already_supplied_rsdp: Option<*const u8>) -> Option { + already_supplied_rsdp.map(|rsdp_ptr| { + // TODO: Validate + unsafe { *(rsdp_ptr as *const Rsdp) } + }) + } + + /// Get the RSDT or XSDT address + pub fn sdt_address(&self) -> PhysicalAddress { + PhysicalAddress::new(if self.revision >= 2 { + self.xsdt_address as usize + } else { + self.rsdt_address as usize + }) + } +} diff 
--git a/src/acpi/rsdt.rs b/src/acpi/rsdt.rs new file mode 100644 index 0000000000..e5e7d54428 --- /dev/null +++ b/src/acpi/rsdt.rs @@ -0,0 +1,52 @@ +use alloc::boxed::Box; +use core::convert::TryFrom; +use rmm::PhysicalAddress; + +use super::{rxsdt::Rxsdt, sdt::Sdt}; + +#[derive(Debug)] +pub struct Rsdt(&'static Sdt); + +impl Rsdt { + pub fn new(sdt: &'static Sdt) -> Option { + if &sdt.signature == b"RSDT" { + Some(Rsdt(sdt)) + } else { + None + } + } + pub fn as_slice(&self) -> &[u8] { + let length = + usize::try_from(self.0.length).expect("expected 32-bit length to fit within usize"); + + unsafe { core::slice::from_raw_parts(self.0 as *const _ as *const u8, length) } + } +} + +impl Rxsdt for Rsdt { + fn iter(&self) -> Box> { + Box::new(RsdtIter { sdt: self.0, i: 0 }) + } +} + +pub struct RsdtIter { + sdt: &'static Sdt, + i: usize, +} + +impl Iterator for RsdtIter { + type Item = PhysicalAddress; + fn next(&mut self) -> Option { + if self.i < self.sdt.data_len() / size_of::() { + let item = unsafe { + (self.sdt.data_address() as *const u32) + .add(self.i) + .read_unaligned() + }; + self.i += 1; + Some(PhysicalAddress::new(item as usize)) + } else { + None + } + } +} diff --git a/src/acpi/rxsdt.rs b/src/acpi/rxsdt.rs new file mode 100644 index 0000000000..a4bc6c967f --- /dev/null +++ b/src/acpi/rxsdt.rs @@ -0,0 +1,6 @@ +use alloc::boxed::Box; +use rmm::PhysicalAddress; + +pub trait Rxsdt { + fn iter(&self) -> Box>; +} diff --git a/src/acpi/sdt.rs b/src/acpi/sdt.rs new file mode 100644 index 0000000000..83ff67dac1 --- /dev/null +++ b/src/acpi/sdt.rs @@ -0,0 +1,27 @@ +#[derive(Copy, Clone, Debug)] +#[repr(C, packed)] +pub struct Sdt { + pub signature: [u8; 4], + pub length: u32, + pub revision: u8, + pub checksum: u8, + pub oem_id: [u8; 6], + pub oem_table_id: [u8; 8], + pub oem_revision: u32, + pub creator_id: u32, + pub creator_revision: u32, +} + +impl Sdt { + /// Get the address of this tables data + pub fn data_address(&self) -> usize { + self as *const _ as 
usize + size_of::() + } + + /// Get the length of this tables data + pub fn data_len(&self) -> usize { + let total_size = self.length as usize; + let header_size = size_of::(); + total_size.saturating_sub(header_size) + } +} diff --git a/src/acpi/spcr.rs b/src/acpi/spcr.rs new file mode 100644 index 0000000000..5f55f1edb5 --- /dev/null +++ b/src/acpi/spcr.rs @@ -0,0 +1,140 @@ +use super::{find_sdt, sdt::Sdt, GenericAddressStructure}; +use crate::{ + arch::device::serial::COM1, + devices::{serial::SerialKind, uart_pl011}, + log::LOG, + memory::{map_device_memory, PhysicalAddress, PAGE_SIZE}, +}; + +const INTERRUPT_TYPE_8259: u8 = 1 << 0; +const INTERRUPT_TYPE_APIC: u8 = 1 << 1; +const INTERRUPT_TYPE_SAPIC: u8 = 1 << 2; +const INTERRUPT_TYPE_GIC: u8 = 1 << 3; +const INTERRUPT_TYPE_PLIC: u8 = 1 << 4; + +#[derive(Clone, Copy, Debug)] +#[repr(C, packed)] +pub struct Spcr { + pub header: Sdt, + pub interface_type: u8, + _reserved: [u8; 3], + pub base_address: GenericAddressStructure, + pub interrupt_type: u8, + pub irq: u8, + pub gsiv: u32, + pub configured_baud_rate: u8, + pub parity: u8, + pub stop_bits: u8, + pub flow_control: u8, + pub terminal_type: u8, + pub language: u8, + pub pci_device_id: u16, + pub pci_vendor_id: u16, + pub pci_bus: u8, + pub pci_device: u8, + pub pci_function: u8, + pub pci_flags: u32, + pub pci_segment: u8, + /*TODO: these fields are optional based on the table revision + pub uart_clock_frequency: u32, + pub precise_baud_rate: u32, + pub namespace_string_length: u16, + pub namespace_string_offset: u16, + */ + // namespace_string +} + +impl Spcr { + pub fn init() { + let spcr_sdt = find_sdt("SPCR"); + let spcr = if spcr_sdt.len() == 1 { + match Spcr::new(spcr_sdt[0]) { + Some(spcr) => spcr, + None => { + warn!("Failed to parse SPCR"); + return; + } + } + } else { + warn!("Unable to find SPCR"); + return; + }; + + if spcr.base_address.address == 0 { + // Serial disabled + return; + } + + let serial_was_empty = !matches!(*COM1.lock(), 
SerialKind::NotPresent); + if spcr.header.revision >= 2 { + match spcr.interface_type { + 3 => { + // PL011 + if spcr.base_address.address_space == 0 + && spcr.base_address.bit_width == 32 + && spcr.base_address.bit_offset == 0 + && spcr.base_address.access_size == 3 + { + let virt = unsafe { + map_device_memory( + PhysicalAddress::new(spcr.base_address.address as usize), + PAGE_SIZE, + ) + }; + let serial_port = uart_pl011::SerialPort::new(virt.data(), false); + *COM1.lock() = SerialKind::Pl011(serial_port); + //TODO: enable IRQ on more platforms and interrupt types + if (spcr.interrupt_type & INTERRUPT_TYPE_GIC) == INTERRUPT_TYPE_GIC { + #[cfg(target_arch = "aarch64")] + unsafe { + crate::arch::device::serial::init_acpi(spcr.gsiv); + } + } + } else { + warn!( + "SPCR unsuppoted address for PL011 {:#x?}", + spcr.base_address + ); + } + } + //TODO: support more types! + unsupported => { + warn!( + "SPCR revision {} unsupported interface type {}", + spcr.header.revision, unsupported + ); + } + } + } else if spcr.header.revision == 1 { + match spcr.interface_type { + //TODO: support more types! 
+ unsupported => { + warn!("SPCR revision 1 unsupported interface type {}", unsupported); + } + } + } else { + warn!("SPCR unsupported revision {}", spcr.header.revision); + } + let mut serial_port = COM1.lock(); + if serial_was_empty && !matches!(*serial_port, SerialKind::NotPresent) { + // backfill logs since the heap is loaded + if let Some(ref mut early_log) = *LOG.lock() { + let (s1, s2) = early_log.read(); + if !s1.is_empty() { + serial_port.write(s1); + } + if !s2.is_empty() { + serial_port.write(s2); + } + } + } + } + + pub fn new(sdt: &'static Sdt) -> Option<&'static Spcr> { + if &sdt.signature == b"SPCR" && sdt.length as usize >= size_of::() { + Some(unsafe { &*((sdt as *const Sdt) as *const Spcr) }) + } else { + None + } + } +} diff --git a/src/acpi/xsdt.rs b/src/acpi/xsdt.rs new file mode 100644 index 0000000000..bb59065e52 --- /dev/null +++ b/src/acpi/xsdt.rs @@ -0,0 +1,50 @@ +use alloc::boxed::Box; +use core::convert::TryFrom; +use rmm::PhysicalAddress; + +use super::{rxsdt::Rxsdt, sdt::Sdt}; + +#[derive(Debug)] +pub struct Xsdt(&'static Sdt); + +impl Xsdt { + pub fn new(sdt: &'static Sdt) -> Option { + if &sdt.signature == b"XSDT" { + Some(Xsdt(sdt)) + } else { + None + } + } + pub fn as_slice(&self) -> &[u8] { + let length = + usize::try_from(self.0.length).expect("expected 32-bit length to fit within usize"); + + unsafe { core::slice::from_raw_parts(self.0 as *const _ as *const u8, length) } + } +} + +impl Rxsdt for Xsdt { + fn iter(&self) -> Box> { + Box::new(XsdtIter { sdt: self.0, i: 0 }) + } +} + +pub struct XsdtIter { + sdt: &'static Sdt, + i: usize, +} + +impl Iterator for XsdtIter { + type Item = PhysicalAddress; + fn next(&mut self) -> Option { + if self.i < self.sdt.data_len() / size_of::() { + let item = unsafe { + core::ptr::read_unaligned((self.sdt.data_address() as *const u64).add(self.i)) + }; + self.i += 1; + Some(PhysicalAddress::new(item as usize)) + } else { + None + } + } +} diff --git a/src/allocator/linked_list.rs 
b/src/allocator/linked_list.rs new file mode 100644 index 0000000000..f17ef4abdf --- /dev/null +++ b/src/allocator/linked_list.rs @@ -0,0 +1,50 @@ +use crate::memory::KernelMapper; +use core::{ + alloc::{GlobalAlloc, Layout}, + ptr::NonNull, +}; +use linked_list_allocator::Heap; +use spin::Mutex; + +static HEAP: Mutex> = Mutex::new(None); + +pub struct Allocator; + +impl Allocator { + pub unsafe fn init(offset: usize, size: usize) { + unsafe { + *HEAP.lock() = Some(Heap::new(offset, size)); + } + } +} + +unsafe impl GlobalAlloc for Allocator { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + unsafe { + while let Some(ref mut heap) = *HEAP.lock() { + match heap.allocate_first_fit(layout) { + Ok(ptr) => return ptr.as_ptr(), + Err(()) => { + let size = heap.size(); + super::map_heap( + &mut KernelMapper::lock_rw(), + crate::kernel_heap_offset() + size, + super::KERNEL_HEAP_SIZE, + ); + heap.extend(super::KERNEL_HEAP_SIZE); + } + } + } + panic!("__rust_allocate: heap not initialized"); + } + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + unsafe { + HEAP.lock() + .as_mut() + .expect("heap not initialized") + .deallocate(NonNull::new_unchecked(ptr), layout) + } + } +} diff --git a/src/allocator/mod.rs b/src/allocator/mod.rs new file mode 100644 index 0000000000..4fdb0ba16e --- /dev/null +++ b/src/allocator/mod.rs @@ -0,0 +1,48 @@ +use crate::memory::{KernelMapper, Page, PageFlags, VirtualAddress}; +use rmm::{Flusher, FrameAllocator, PageFlushAll}; + +pub use self::linked_list::Allocator; +mod linked_list; + +/// Size of kernel heap +const KERNEL_HEAP_SIZE: usize = ::rmm::MEGABYTE; + +unsafe fn map_heap(mapper: &mut KernelMapper, offset: usize, size: usize) { + let mut flush_all = PageFlushAll::new(); + + let heap_start_page = Page::containing_address(VirtualAddress::new(offset)); + let heap_end_page = Page::containing_address(VirtualAddress::new(offset + size - 1)); + for page in Page::range_inclusive(heap_start_page, heap_end_page) { + let phys 
= mapper + .allocator_mut() + .allocate_one() + .expect("failed to allocate kernel heap"); + let flush = unsafe { + mapper + .map_phys( + page.start_address(), + phys, + PageFlags::new() + .write(true) + .global(cfg!(not(feature = "pti"))), + ) + .expect("failed to map kernel heap") + }; + flush_all.consume(flush); + } + + flush_all.flush(); +} + +pub unsafe fn init() { + unsafe { + let offset = crate::kernel_heap_offset(); + let size = KERNEL_HEAP_SIZE; + + // Map heap pages + map_heap(&mut KernelMapper::lock_rw(), offset, size); + + // Initialize global heap + Allocator::init(offset, size); + } +} diff --git a/src/arch/aarch64/consts.rs b/src/arch/aarch64/consts.rs new file mode 100644 index 0000000000..ea0a640149 --- /dev/null +++ b/src/arch/aarch64/consts.rs @@ -0,0 +1,15 @@ +// Because the memory map is so important to not be aliased, it is defined here, in one place +// The lower 256 PML4 entries are reserved for userspace +// Each PML4 entry references up to 512 GB of memory +// The second from the top (510) PML4 is reserved for the kernel +/// The size of a single PML4 +pub const PML4_SIZE: usize = 0x0000_0080_0000_0000; + +/// Offset to kernel heap +#[inline(always)] +pub fn kernel_heap_offset() -> usize { + crate::kernel_executable_offsets::KERNEL_OFFSET() - PML4_SIZE +} + +/// End offset of the user image, i.e. 
kernel start +pub const USER_END_OFFSET: usize = 256 * PML4_SIZE; diff --git a/src/arch/aarch64/debug.rs b/src/arch/aarch64/debug.rs new file mode 100644 index 0000000000..b66a081c2e --- /dev/null +++ b/src/arch/aarch64/debug.rs @@ -0,0 +1,19 @@ +use spin::MutexGuard; + +use crate::{arch::device::serial::COM1, devices::serial::SerialKind}; + +pub struct Writer<'a> { + serial: MutexGuard<'a, SerialKind>, +} + +impl<'a> Writer<'a> { + pub fn new() -> Writer<'a> { + Writer { + serial: COM1.lock(), + } + } + + pub fn write(&mut self, buf: &[u8]) { + self.serial.write(buf); + } +} diff --git a/src/arch/aarch64/device/cpu/mod.rs b/src/arch/aarch64/device/cpu/mod.rs new file mode 100644 index 0000000000..1b59edfd87 --- /dev/null +++ b/src/arch/aarch64/device/cpu/mod.rs @@ -0,0 +1,277 @@ +use core::fmt::{Result, Write}; + +use crate::arch::device::cpu::registers::{control_regs, id_regs}; + +pub mod registers; + +bitfield::bitfield! { + pub struct MachineId(u32); + get_implementer, _: 31, 24; + get_variant, _: 23, 20; + get_architecture, _: 19, 16; + get_part_number, _: 15, 4; + get_revision, _: 3, 0; +} + +enum ImplementerID { + Unknown, + Arm, + Broadcom, + Cavium, + Digital, + Fujitsu, + Infineon, + Motorola, + Nvidia, + AMCC, + Qualcomm, + Marvell, + Intel, + Ampere, +} + +const IMPLEMENTERS: [&'static str; 14] = [ + "Unknown", "Arm", "Broadcom", "Cavium", "Digital", "Fujitsu", "Infineon", "Motorola", "Nvidia", + "AMCC", "Qualcomm", "Marvell", "Intel", "Ampere", +]; + +enum VariantID { + Unknown, +} + +const VARIANTS: [&'static str; 1] = ["Unknown"]; + +enum ArchitectureID { + Unknown, + V4, + V4T, + V5, + V5T, + V5TE, + V5TEJ, + V6, +} + +const ARCHITECTURES: [&'static str; 8] = + ["Unknown", "v4", "v4T", "v5", "v5T", "v5TE", "v5TEJ", "v6"]; + +enum PartNumberID { + Unknown, + Thunder, + Foundation, + CortexA35, + CortexA53, + CortexA55, + CortexA57, + CortexA72, + CortexA73, + CortexA75, +} + +const PART_NUMBERS: [&'static str; 10] = [ + "Unknown", + "Thunder", + 
"Foundation", + "Cortex-A35", + "Cortex-A53", + "Cortex-A55", + "Cortex-A57", + "Cortex-A72", + "Cortex-A73", + "Cortex-A75", +]; + +enum RevisionID { + Unknown, + Thunder1_0, + Thunder1_1, +} + +const REVISIONS: [&'static str; 3] = ["Unknown", "Thunder-1.0", "Thunder-1.1"]; + +struct CpuInfo { + implementer: &'static str, + variant: &'static str, + architecture: &'static str, + part_number: &'static str, + revision: &'static str, + aa64isar0: id_regs::AA64Isar0, + aa64isar1: id_regs::AA64Isar1, +} + +impl CpuInfo { + fn new() -> CpuInfo { + let midr = unsafe { control_regs::midr() }; + let midr = MachineId(midr); + + let implementer = match midr.get_implementer() { + 0x41 => IMPLEMENTERS[ImplementerID::Arm as usize], + 0x42 => IMPLEMENTERS[ImplementerID::Broadcom as usize], + 0x43 => IMPLEMENTERS[ImplementerID::Cavium as usize], + 0x44 => IMPLEMENTERS[ImplementerID::Digital as usize], + 0x46 => IMPLEMENTERS[ImplementerID::Fujitsu as usize], + 0x49 => IMPLEMENTERS[ImplementerID::Infineon as usize], + 0x4d => IMPLEMENTERS[ImplementerID::Motorola as usize], + 0x4e => IMPLEMENTERS[ImplementerID::Nvidia as usize], + 0x50 => IMPLEMENTERS[ImplementerID::AMCC as usize], + 0x51 => IMPLEMENTERS[ImplementerID::Qualcomm as usize], + 0x56 => IMPLEMENTERS[ImplementerID::Marvell as usize], + 0x69 => IMPLEMENTERS[ImplementerID::Intel as usize], + 0xc0 => IMPLEMENTERS[ImplementerID::Ampere as usize], + _ => IMPLEMENTERS[ImplementerID::Unknown as usize], + }; + + let variant = match midr.get_variant() { + _ => VARIANTS[VariantID::Unknown as usize], + }; + + let architecture = match midr.get_architecture() { + 0b0001 => ARCHITECTURES[ArchitectureID::V4 as usize], + 0b0010 => ARCHITECTURES[ArchitectureID::V4T as usize], + 0b0011 => ARCHITECTURES[ArchitectureID::V5 as usize], + 0b0100 => ARCHITECTURES[ArchitectureID::V5T as usize], + 0b0101 => ARCHITECTURES[ArchitectureID::V5TE as usize], + 0b0110 => ARCHITECTURES[ArchitectureID::V5TEJ as usize], + 0b0111 => 
ARCHITECTURES[ArchitectureID::V6 as usize], + _ => ARCHITECTURES[ArchitectureID::Unknown as usize], + }; + + let part_number = match midr.get_part_number() { + 0x0a1 => PART_NUMBERS[PartNumberID::Thunder as usize], + 0xd00 => PART_NUMBERS[PartNumberID::Foundation as usize], + 0xd04 => PART_NUMBERS[PartNumberID::CortexA35 as usize], + 0xd03 => PART_NUMBERS[PartNumberID::CortexA53 as usize], + 0xd05 => PART_NUMBERS[PartNumberID::CortexA55 as usize], + 0xd07 => PART_NUMBERS[PartNumberID::CortexA57 as usize], + 0xd08 => PART_NUMBERS[PartNumberID::CortexA72 as usize], + 0xd09 => PART_NUMBERS[PartNumberID::CortexA73 as usize], + 0xd0a => PART_NUMBERS[PartNumberID::CortexA75 as usize], + _ => PART_NUMBERS[PartNumberID::Unknown as usize], + }; + + let revision = match part_number { + "Thunder" => { + let val = match midr.get_revision() { + 0x00 => REVISIONS[RevisionID::Thunder1_0 as usize], + 0x01 => REVISIONS[RevisionID::Thunder1_1 as usize], + _ => REVISIONS[RevisionID::Unknown as usize], + }; + val + } + _ => REVISIONS[RevisionID::Unknown as usize], + }; + + let aa64isar0 = id_regs::aa64isar0(); + let aa64isar1 = id_regs::aa64isar1(); + + CpuInfo { + implementer, + variant, + architecture, + part_number, + revision, + aa64isar0, + aa64isar1, + } + } +} + +pub fn cpu_info(w: &mut W) -> Result { + let cpuinfo = CpuInfo::new(); + + writeln!(w, "Implementer: {}", cpuinfo.implementer)?; + writeln!(w, "Variant: {}", cpuinfo.variant)?; + writeln!(w, "Architecture version: {}", cpuinfo.architecture)?; + writeln!(w, "Part Number: {}", cpuinfo.part_number)?; + writeln!(w, "Revision: {}", cpuinfo.revision)?; + + // Print detected CPU features. + // Follow the naming convention estabilished by `std::arch::is_aarch64_feature_detected`. 
+ write!(w, "Features:")?; + + // ID_AA64ISAR0_EL1 + if cpuinfo.aa64isar0.has_feat_rng() { + write!(w, " rand")?; + } + if cpuinfo.aa64isar0.has_feat_flagm() { + write!(w, " flagm")?; + } + if cpuinfo.aa64isar0.has_feat_flagm2() { + write!(w, " flagm2")?; + } + if cpuinfo.aa64isar0.has_feat_fhm() { + write!(w, " fhm")?; + } + if cpuinfo.aa64isar0.has_feat_dotprod() { + write!(w, " dotprod")?; + } + if cpuinfo.aa64isar0.has_feat_sm3() && cpuinfo.aa64isar0.has_feat_sm4() { + write!(w, " sm4")?; + } + if cpuinfo.aa64isar0.has_feat_sha512() && cpuinfo.aa64isar0.has_feat_sha3() { + write!(w, " sha3")?; + } + if cpuinfo.aa64isar0.has_feat_rdm() { + write!(w, " rdm")?; + } + if cpuinfo.aa64isar0.has_feat_lse() { + write!(w, " lse")?; + } + if cpuinfo.aa64isar0.has_feat_lse128() { + write!(w, " lse128")?; + } + if cpuinfo.aa64isar0.has_feat_crc() { + write!(w, " crc")?; + } + if cpuinfo.aa64isar0.has_feat_sha1() && cpuinfo.aa64isar0.has_feat_sha256() { + write!(w, " sha2")?; + } + if cpuinfo.aa64isar0.has_feat_aes() && cpuinfo.aa64isar0.has_feat_pmull() { + write!(w, " aes")?; + } + + // ID_AA64ISAR1_EL1 + if cpuinfo.aa64isar1.has_feat_i8mm() { + write!(w, " i8mm")?; + } + if cpuinfo.aa64isar1.has_feat_bf16() { + write!(w, " bf16")?; + } + if cpuinfo.aa64isar1.has_feat_sb() { + write!(w, " sb")?; + } + if cpuinfo.aa64isar1.has_feat_frintts() { + write!(w, " frintts")?; + } + if cpuinfo.aa64isar1.gpi() != 0 || cpuinfo.aa64isar1.gpa() != 0 { + write!(w, " pacg")?; + } + if cpuinfo.aa64isar1.has_feat_lrcpc() { + write!(w, " rcpc")?; + } + if cpuinfo.aa64isar1.has_feat_lrcpc2() { + write!(w, " rcpc2")?; + } + if cpuinfo.aa64isar1.has_feat_lrcpc3() { + write!(w, " rcpc3")?; + } + if cpuinfo.aa64isar1.has_feat_fcma() { + write!(w, " fcma")?; + } + if cpuinfo.aa64isar1.has_feat_jscvt() { + write!(w, " jsconv")?; + } + if cpuinfo.aa64isar1.api() != 0 || cpuinfo.aa64isar1.apa() != 0 { + write!(w, " paca")?; + } + if cpuinfo.aa64isar1.has_feat_dpb() { + write!(w, " dpb")?; + } + if 
/// Report whether the CPU implements the Virtualization Host Extensions.
///
/// Reads `ID_AA64MMFR1_EL1` and tests its `VH` field; any non-zero value
/// means VHE is implemented.
pub unsafe fn vhe_present() -> bool {
    unsafe {
        let mmfr1: u64;
        asm!("mrs {}, id_aa64mmfr1_el1", out(reg) mmfr1);

        // The VH (Virtualization Host Extensions) field is in bits [11:8].
        // Bits [7:4], which were tested previously, are VMIDBits and say
        // nothing about VHE.
        let vhe_field = (mmfr1 >> 8) & 0b1111;

        vhe_field != 0
    }
}
{ + pub struct AA64Isar0(u64); + impl Debug; + pub rndr, _: 63, 60; + pub tlb, _: 59, 56; + pub ts, _: 55, 52; + pub fhm, _: 51, 48; + pub dp, _: 47, 44; + pub sm4, _: 43, 40; + pub sm3, _: 39, 36; + pub sha3, _: 35, 32; + pub rdm, _: 31, 28; + pub atomic, _: 23, 20; + pub crc32, _: 19, 16; + pub sha2, _: 15, 12; + pub sha1, _: 11, 8; + pub aes, _: 7, 4; +} + +bitfield::bitfield! { + pub struct AA64Isar1(u64); + impl Debug; + pub ls64, _: 63, 60; + pub xs, _: 59, 56; + pub i8mm, _: 55, 52; + pub dgh, _: 51, 48; + pub bf16, _: 47, 44; + pub specres, _: 43, 40; + pub sb, _: 39, 36; + pub frintts, _: 35, 32; + pub gpi, _: 31, 28; + pub gpa, _: 27, 24; + pub lrcpc, _: 23, 20; + pub fcma, _: 19, 16; + pub jscvt, _: 15, 12; + pub api, _: 11, 8; + pub apa, _: 7, 4; + pub dpb, _: 3, 0; +} + +impl AA64Isar0 { + pub fn has_feat_rng(&self) -> bool { + self.rndr() == 0b0001 + } + pub fn has_feat_flagm(&self) -> bool { + self.ts() == 0b0001 + } + pub fn has_feat_flagm2(&self) -> bool { + self.ts() == 0b0010 + } + pub fn has_feat_fhm(&self) -> bool { + self.fhm() == 0b0001 + } + pub fn has_feat_dotprod(&self) -> bool { + self.dp() == 0b0001 + } + pub fn has_feat_sm4(&self) -> bool { + self.sm4() == 0b0001 + } + pub fn has_feat_sm3(&self) -> bool { + self.sm3() == 0b0001 + } + pub fn has_feat_sha3(&self) -> bool { + self.sha3() == 0b0001 + } + pub fn has_feat_rdm(&self) -> bool { + self.rdm() == 0b0001 + } + pub fn has_feat_lse(&self) -> bool { + self.atomic() == 0b0010 + } + pub fn has_feat_lse128(&self) -> bool { + self.atomic() == 0b0011 + } + /// The current Arm Architecture Registers Manual calls it FEAT_CRC32, + /// but everyone else seems to call it FEAT_CRC. 
+ pub fn has_feat_crc(&self) -> bool { + self.crc32() == 0b0001 + } + pub fn has_feat_sha256(&self) -> bool { + self.sha2() == 0b0001 + } + pub fn has_feat_sha512(&self) -> bool { + self.sha2() == 0b0010 + } + pub fn has_feat_sha1(&self) -> bool { + self.sha1() == 0b0001 + } + pub fn has_feat_aes(&self) -> bool { + self.aes() == 0b0001 + } + pub fn has_feat_pmull(&self) -> bool { + self.aes() == 0b0010 + } +} + +impl AA64Isar1 { + pub fn has_feat_i8mm(&self) -> bool { + self.i8mm() == 0b0001 + } + pub fn has_feat_bf16(&self) -> bool { + self.bf16() == 0b0001 + } + pub fn has_feat_sb(&self) -> bool { + self.sb() == 0b0001 + } + pub fn has_feat_frintts(&self) -> bool { + self.frintts() == 0b0001 + } + pub fn has_feat_lrcpc(&self) -> bool { + self.lrcpc() == 0b0001 + } + pub fn has_feat_lrcpc2(&self) -> bool { + self.lrcpc() == 0b0010 + } + pub fn has_feat_lrcpc3(&self) -> bool { + self.lrcpc() == 0b0011 + } + pub fn has_feat_fcma(&self) -> bool { + self.fcma() == 0b0001 + } + pub fn has_feat_jscvt(&self) -> bool { + self.jscvt() == 0b0011 + } + pub fn has_feat_dpb(&self) -> bool { + self.dpb() == 0b0001 + } + pub fn has_feat_dpb2(&self) -> bool { + self.dpb() == 0b0010 + } +} + +pub fn aa64isar0() -> AA64Isar0 { + let ret: u64; + unsafe { + asm!("mrs {}, ID_AA64ISAR0_EL1", out(reg) ret); + } + AA64Isar0(ret) +} + +pub fn aa64isar1() -> AA64Isar1 { + let ret: u64; + unsafe { + asm!("mrs {}, ID_AA64ISAR1_EL1", out(reg) ret); + } + AA64Isar1(ret) +} diff --git a/src/arch/aarch64/device/cpu/registers/mod.rs b/src/arch/aarch64/device/cpu/registers/mod.rs new file mode 100644 index 0000000000..3b0472a82d --- /dev/null +++ b/src/arch/aarch64/device/cpu/registers/mod.rs @@ -0,0 +1,2 @@ +pub mod control_regs; +pub mod id_regs; diff --git a/src/arch/aarch64/device/generic_timer.rs b/src/arch/aarch64/device/generic_timer.rs new file mode 100644 index 0000000000..b3c29c92f5 --- /dev/null +++ b/src/arch/aarch64/device/generic_timer.rs @@ -0,0 +1,145 @@ +use alloc::boxed::Box; + 
+use super::ic_for_chip; +use crate::{ + arch::device::cpu::registers::control_regs, + context::{self, timeout}, + dtb::{ + get_interrupt, + irqchip::{register_irq, InterruptHandler, IRQ_CHIP}, + }, + scheme::irq::irq_trigger, + sync::CleanLockToken, + time, +}; +use fdt::Fdt; + +bitflags! { + struct TimerCtrlFlags: u32 { + const ENABLE = 1 << 0; + const IMASK = 1 << 1; + const ISTATUS = 1 << 2; + } +} + +pub unsafe fn init(fdt: &Fdt) { + unsafe { + let mut timer = GenericTimer::new(); + timer.init(); + if let Some(node) = fdt.find_compatible(&["arm,armv7-timer"]) { + let irq = get_interrupt(fdt, &node, 1).unwrap(); + debug!("irq = {:?}", irq); + if let Some(ic_idx) = ic_for_chip(&fdt, &node) { + //PHYS_NONSECURE_PPI only + let virq = IRQ_CHIP.irq_chip_list.chips[ic_idx] + .ic + .irq_xlate(irq) + .unwrap(); + info!("generic_timer virq = {}", virq); + register_irq(virq as u32, Box::new(timer)); + IRQ_CHIP.irq_enable(virq as u32); + } else { + error!("Failed to find irq parent for generic timer"); + } + } + } +} + +pub struct GenericTimer { + pub use_virtual_timer: bool, + pub clk_freq: u32, + pub reload_count: u32, +} + +impl GenericTimer { + pub fn new() -> Self { + Self { + use_virtual_timer: false, + clk_freq: 0, + reload_count: 0, + } + } + pub fn init(&mut self) { + self.use_virtual_timer = unsafe { !control_regs::vhe_present() }; + debug!( + "generic_timer use_virtual_timer = {:?}", + self.use_virtual_timer + ); + let clk_freq = unsafe { control_regs::cntfrq_el0() }; + self.clk_freq = clk_freq; + self.reload_count = clk_freq / 100; + self.reload_count(); + } + + fn read_tmr_ctrl(&self) -> TimerCtrlFlags { + TimerCtrlFlags::from_bits_truncate(if self.use_virtual_timer { + unsafe { control_regs::vtmr_ctrl() } + } else { + unsafe { control_regs::ptmr_ctrl() } + }) + } + + fn write_tmr_ctrl(&self, ctrl: TimerCtrlFlags) { + if self.use_virtual_timer { + unsafe { control_regs::vtmr_ctrl_write(ctrl.bits()) }; + } else { + unsafe { 
impl InterruptHandler for GenericTimer {
    /// Handle one timer interrupt: advance the global time offset, run
    /// timeouts and the scheduler tick, forward the IRQ to the irq scheme,
    /// signal end-of-interrupt and re-arm the timer.
    fn irq_handler(&mut self, irq: u32, token: &mut CleanLockToken) {
        // Masks the timer interrupt if its status bit is currently set.
        self.clear_irq();
        {
            // NOTE(review): the timer is re-armed with `clk_freq / 100` counts
            // (see `GenericTimer::init`), yet the offset advances by the full
            // `clk_freq` on every tick — confirm the unit `time::OFFSET` is
            // expected to be in.
            *time::OFFSET.write(token.token()) += self.clk_freq as u128;
        }

        timeout::trigger(token);
        context::switch::tick(token);

        unsafe {
            // FIXME add_irq accepts a u8 as irq number
            // PercpuBlock::current().stats.add_irq(irq);

            // Panics if `irq` does not fit the type `irq_trigger` expects.
            irq_trigger(irq.try_into().unwrap(), token);
            IRQ_CHIP.irq_eoi(irq);
        }
        // Reload the countdown, enable the timer and unmask its interrupt.
        self.reload_count();
    }
}
GICD_ISENABLER: u32 = 0x100; +static GICD_ICENABLER: u32 = 0x180; +static GICD_IPRIORITY: u32 = 0x400; +static GICD_ITARGETSR: u32 = 0x800; +static GICD_ICFGR: u32 = 0xc00; + +static GICC_EOIR: u32 = 0x0010; +static GICC_IAR: u32 = 0x000c; +static GICC_CTLR: u32 = 0x0000; +static GICC_PMR: u32 = 0x0004; + +pub struct GenericInterruptController { + pub gic_dist_if: GicDistIf, + pub gic_cpu_if: GicCpuIf, + pub irq_range: (usize, usize), +} + +impl GenericInterruptController { + pub fn new() -> Self { + let gic_dist_if = GicDistIf::default(); + let gic_cpu_if = GicCpuIf::default(); + + GenericInterruptController { + gic_dist_if, + gic_cpu_if, + irq_range: (0, 0), + } + } + pub fn parse(fdt: &Fdt) -> Result<(usize, usize, usize, usize)> { + if let Some(node) = fdt.find_compatible(&["arm,cortex-a15-gic", "arm,gic-400"]) { + return GenericInterruptController::parse_inner(fdt, &node); + } else { + return Err(Error::new(EINVAL)); + } + } + fn parse_inner(fdt: &Fdt, node: &FdtNode) -> Result<(usize, usize, usize, usize)> { + //assert address_cells == 0x2, size_cells == 0x2 + let reg = node.reg().unwrap(); + let mut regs = (0, 0, 0, 0); + let mut idx = 0; + + for chunk in reg { + if chunk.size.is_none() { + break; + } + let addr = get_mmio_address(fdt, node, &chunk).unwrap(); + match idx { + 0 => (regs.0, regs.1) = (addr, chunk.size.unwrap()), + 2 => (regs.2, regs.3) = (addr, chunk.size.unwrap()), + _ => break, + } + idx += 2; + } + + if idx == 4 { + Ok(regs) + } else { + Err(Error::new(EINVAL)) + } + } +} + +impl InterruptHandler for GenericInterruptController { + fn irq_handler(&mut self, _irq: u32, token: &mut CleanLockToken) {} +} + +impl InterruptController for GenericInterruptController { + fn irq_init( + &mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + ic_idx: usize, + irq_idx: &mut usize, + ) -> Result<()> { + if let Some(fdt) = fdt_opt { + let (dist_addr, _dist_size, cpu_addr, _cpu_size) = + match GenericInterruptController::parse(fdt) { + 
Ok(regs) => regs, + Err(err) => return Err(err), + }; + + unsafe { + self.gic_dist_if.init(crate::PHYS_OFFSET + dist_addr); + self.gic_cpu_if.init(crate::PHYS_OFFSET + cpu_addr); + } + } + let idx = *irq_idx; + let cnt = if self.gic_dist_if.nirqs > 1024 { + 1024 + } else { + self.gic_dist_if.nirqs as usize + }; + let mut i: usize = 0; + //only support linear irq map now. + while i < cnt && (idx + i < 1024) { + irq_desc[idx + i].basic.ic_idx = ic_idx; + irq_desc[idx + i].basic.ic_irq = i as u32; + irq_desc[idx + i].basic.used = true; + + i += 1; + } + + info!("gic irq_range = ({}, {})", idx, idx + cnt); + self.irq_range = (idx, idx + cnt); + *irq_idx = idx + cnt; + Ok(()) + } + fn irq_ack(&mut self) -> u32 { + unsafe { self.gic_cpu_if.irq_ack() } + } + fn irq_eoi(&mut self, irq_num: u32) { + unsafe { self.gic_cpu_if.irq_eoi(irq_num) } + } + fn irq_enable(&mut self, irq_num: u32) { + unsafe { self.gic_dist_if.irq_enable(irq_num) } + } + fn irq_disable(&mut self, irq_num: u32) { + unsafe { self.gic_dist_if.irq_disable(irq_num) } + } + fn irq_xlate(&self, irq_data: IrqCell) -> Result { + let off = match irq_data { + IrqCell::L3(0, irq, _flags) => irq as usize + 32, // SPI + IrqCell::L3(1, irq, _flags) => irq as usize + 16, // PPI + _ => return Err(Error::new(EINVAL)), + }; + return Ok(off + self.irq_range.0); + } + fn irq_to_virq(&self, hwirq: u32) -> Option { + if hwirq >= self.gic_dist_if.nirqs { + None + } else { + Some(self.irq_range.0 + hwirq as usize) + } + } +} + +#[derive(Debug, Default)] +pub struct GicDistIf { + pub address: usize, + pub ncpus: u32, + pub nirqs: u32, +} + +impl GicDistIf { + pub unsafe fn init(&mut self, addr: usize) { + unsafe { + self.address = addr; + + // Disable IRQ Distribution + self.write(GICD_CTLR, 0); + + let typer = self.read(GICD_TYPER); + self.ncpus = ((typer & (0x7 << 5)) >> 5) + 1; + self.nirqs = ((typer & 0x1f) + 1) * 32; + info!( + "gic: Distributor supports {:?} CPUs and {:?} IRQs", + self.ncpus, self.nirqs + ); + + // Set 
all SPIs to level triggered + for irq in (32..self.nirqs).step_by(16) { + self.write(GICD_ICFGR + ((irq / 16) * 4), 0); + } + + // Disable all SPIs + for irq in (32..self.nirqs).step_by(32) { + self.write(GICD_ICENABLER + ((irq / 32) * 4), 0xffff_ffff); + } + + // Affine all SPIs to CPU0 and set priorities for all IRQs + for irq in 0..self.nirqs { + if irq > 31 { + let ext_offset = GICD_ITARGETSR + (4 * (irq / 4)); + let int_offset = irq % 4; + let mut val = self.read(ext_offset); + val |= 0b0000_0001 << (8 * int_offset); + self.write(ext_offset, val); + } + + let ext_offset = GICD_IPRIORITY + (4 * (irq / 4)); + let int_offset = irq % 4; + let mut val = self.read(ext_offset); + val |= 0b0000_0000 << (8 * int_offset); + self.write(ext_offset, val); + } + + // Enable IRQ group 0 and group 1 non-secure distribution + self.write(GICD_CTLR, 0x3); + } + } + + pub unsafe fn irq_enable(&mut self, irq: u32) { + unsafe { + let offset = GICD_ISENABLER + (4 * (irq / 32)); + let shift = 1 << (irq % 32); + let mut val = self.read(offset); + val |= shift; + self.write(offset, val); + } + } + + pub unsafe fn irq_disable(&mut self, irq: u32) { + unsafe { + let offset = GICD_ICENABLER + (4 * (irq / 32)); + let shift = 1 << (irq % 32); + let mut val = self.read(offset); + val |= shift; + self.write(offset, val); + } + } + + unsafe fn read(&self, reg: u32) -> u32 { + unsafe { + let val = read_volatile((self.address + reg as usize) as *const u32); + val + } + } + + unsafe fn write(&mut self, reg: u32, value: u32) { + unsafe { + write_volatile((self.address + reg as usize) as *mut u32, value); + } + } +} + +#[derive(Debug, Default)] +pub struct GicCpuIf { + pub address: usize, +} + +impl GicCpuIf { + pub unsafe fn init(&mut self, addr: usize) { + unsafe { + self.address = addr; + + // Enable CPU0's GIC interface + self.write(GICC_CTLR, 1); + // Set CPU0's Interrupt Priority Mask + self.write(GICC_PMR, 0xff); + } + } + + unsafe fn irq_ack(&mut self) -> u32 { + unsafe { + let irq = 
self.read(GICC_IAR) & 0x1ff; + if irq == 1023 { + panic!("irq_ack: got ID 1023!!!"); + } + irq + } + } + + unsafe fn irq_eoi(&mut self, irq: u32) { + unsafe { + self.write(GICC_EOIR, irq); + } + } + + unsafe fn read(&self, reg: u32) -> u32 { + unsafe { + let val = read_volatile((self.address + reg as usize) as *const u32); + val + } + } + + unsafe fn write(&mut self, reg: u32, value: u32) { + unsafe { + write_volatile((self.address + reg as usize) as *mut u32, value); + } + } +} diff --git a/src/arch/aarch64/device/irqchip/gicv3.rs b/src/arch/aarch64/device/irqchip/gicv3.rs new file mode 100644 index 0000000000..9d8a0d5211 --- /dev/null +++ b/src/arch/aarch64/device/irqchip/gicv3.rs @@ -0,0 +1,196 @@ +use alloc::vec::Vec; +use core::arch::asm; +use fdt::{node::NodeProperty, Fdt}; + +use super::{gic::GicDistIf, InterruptController}; +use crate::{ + dtb::{ + get_mmio_address, + irqchip::{InterruptHandler, IrqCell, IrqDesc}, + }, + sync::CleanLockToken, +}; +use syscall::{ + error::{Error, EINVAL}, + Result, +}; + +#[derive(Debug)] +pub struct GicV3 { + pub gic_dist_if: GicDistIf, + pub gic_cpu_if: GicV3CpuIf, + pub gicrs: Vec<(usize, usize)>, + //TODO: GICC, GICH, GICV? + pub irq_range: (usize, usize), +} + +impl GicV3 { + pub fn new() -> Self { + GicV3 { + gic_dist_if: GicDistIf::default(), + gic_cpu_if: GicV3CpuIf, + gicrs: Vec::new(), + irq_range: (0, 0), + } + } + + pub fn parse(&mut self, fdt: &Fdt) -> Result<()> { + let Some(node) = fdt.find_compatible(&["arm,gic-v3"]) else { + return Err(Error::new(EINVAL)); + }; + + // Clear current registers + //TODO: deinit? 
+ self.gic_dist_if.address = 0; + self.gicrs.clear(); + + // Get number of GICRs + let gicrs = node + .property("#redistributor-regions") + .and_then(NodeProperty::as_usize) + .unwrap_or(1); + + // Read registers + let mut chunks = node.reg().unwrap(); + if let Some(gicd) = chunks.next() + && let Some(addr) = get_mmio_address(fdt, &node, &gicd) + { + unsafe { + self.gic_dist_if.init(crate::PHYS_OFFSET + addr); + } + } + for _ in 0..gicrs { + if let Some(gicr) = chunks.next() { + self.gicrs.push(( + get_mmio_address(fdt, &node, &gicr).unwrap(), + gicr.size.unwrap(), + )); + } + } + + if self.gic_dist_if.address == 0 || self.gicrs.is_empty() { + Err(Error::new(EINVAL)) + } else { + Ok(()) + } + } +} + +impl InterruptHandler for GicV3 { + fn irq_handler(&mut self, _irq: u32, token: &mut CleanLockToken) {} +} + +impl InterruptController for GicV3 { + fn irq_init( + &mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + ic_idx: usize, + irq_idx: &mut usize, + ) -> Result<()> { + if let Some(fdt) = fdt_opt { + self.parse(fdt)?; + } + info!("{:X?}", self); + + unsafe { + self.gic_cpu_if.init(); + } + let idx = *irq_idx; + let cnt = if self.gic_dist_if.nirqs > 1024 { + 1024 + } else { + self.gic_dist_if.nirqs as usize + }; + let mut i: usize = 0; + //only support linear irq map now. 
+ while i < cnt && (idx + i < 1024) { + irq_desc[idx + i].basic.ic_idx = ic_idx; + irq_desc[idx + i].basic.ic_irq = i as u32; + irq_desc[idx + i].basic.used = true; + + i += 1; + } + + info!("gic irq_range = ({}, {})", idx, idx + cnt); + self.irq_range = (idx, idx + cnt); + *irq_idx = idx + cnt; + Ok(()) + } + fn irq_ack(&mut self) -> u32 { + let irq_num = unsafe { self.gic_cpu_if.irq_ack() }; + irq_num + } + fn irq_eoi(&mut self, irq_num: u32) { + unsafe { self.gic_cpu_if.irq_eoi(irq_num) } + } + fn irq_enable(&mut self, irq_num: u32) { + unsafe { self.gic_dist_if.irq_enable(irq_num) } + } + fn irq_disable(&mut self, irq_num: u32) { + unsafe { self.gic_dist_if.irq_disable(irq_num) } + } + fn irq_xlate(&self, irq_data: IrqCell) -> Result { + let off = match irq_data { + IrqCell::L3(0, irq, _flags) => irq as usize + 32, // SPI + IrqCell::L3(1, irq, _flags) => irq as usize + 16, // PPI + _ => return Err(Error::new(EINVAL)), + }; + return Ok(off + self.irq_range.0); + } + fn irq_to_virq(&self, hwirq: u32) -> Option { + if hwirq >= self.gic_dist_if.nirqs { + None + } else { + Some(self.irq_range.0 + hwirq as usize) + } + } +} + +#[derive(Debug)] +pub struct GicV3CpuIf; + +impl GicV3CpuIf { + pub unsafe fn init(&mut self) { + unsafe { + // Enable system register access + { + let value = 1_usize; + asm!("msr icc_sre_el1, {}", in(reg) value); + } + // Set control register + { + let value = 0_usize; + asm!("msr icc_ctlr_el1, {}", in(reg) value); + } + // Enable non-secure group 1 + { + let value = 1_usize; + asm!("msr icc_igrpen1_el1, {}", in(reg) value); + } + // Set CPU0's Interrupt Priority Mask + { + let value = 0xFF_usize; + asm!("msr icc_pmr_el1, {}", in(reg) value); + } + } + } + + unsafe fn irq_ack(&mut self) -> u32 { + unsafe { + let mut irq: usize; + asm!("mrs {}, icc_iar1_el1", out(reg) irq); + irq &= 0x1ff; + if irq == 1023 { + panic!("irq_ack: got ID 1023!!!"); + } + irq as u32 + } + } + + unsafe fn irq_eoi(&mut self, irq: u32) { + unsafe { + asm!("msr 
/// Find-first-set: returns the 1-based position of the least significant set
/// bit of `num`, or 0 when `num` is zero.
#[inline(always)]
fn ffs(num: u32) -> u32 {
    if num == 0 {
        0
    } else {
        // `trailing_zeros` is the 0-based index of the lowest set bit.
        num.trailing_zeros() + 1
    }
}
&mem).unwrap(); + let size = mem.size.unwrap() as u32; + let mut ret_virq = None; + + if let Some(interrupt_parent) = node.property("interrupt-parent") { + let phandle = interrupt_parent.as_usize().unwrap() as u32; + let irq = get_interrupt(fdt, node, 0).unwrap(); + let ic_idx = IRQ_CHIP.phandle_to_ic_idx(phandle).unwrap(); + //PHYS_NONSECURE_PPI only + let virq = IRQ_CHIP.irq_chip_list.chips[ic_idx] + .ic + .irq_xlate(irq) + .unwrap(); + info!( + "register bcm2835arm_ctrl as ic_idx {}'s child virq = {}", + ic_idx, virq + ); + ret_virq = Some(virq); + } + Ok((base as usize, size as usize, ret_virq)) + } + } + + unsafe fn init(&mut self) { + unsafe { + debug!("IRQ BCM2835 INIT"); + //disable all interrupt + self.write(DISABLE_0, 0xffff_ffff); + self.write(DISABLE_1, 0xffff_ffff); + self.write(DISABLE_2, 0xffff_ffff); + + debug!("IRQ BCM2835 END"); + } + } + + unsafe fn read(&self, reg: u32) -> u32 { + unsafe { + let val = read_volatile((self.address + reg as usize) as *const u32); + val + } + } + + unsafe fn write(&mut self, reg: u32, value: u32) { + unsafe { + write_volatile((self.address + reg as usize) as *mut u32, value); + } + } +} + +impl InterruptController for Bcm2835ArmInterruptController { + fn irq_init( + &mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + ic_idx: usize, + irq_idx: &mut usize, + ) -> Result<()> { + let (base, _size, _virq) = match Bcm2835ArmInterruptController::parse(fdt_opt.unwrap()) { + Ok((a, b, c)) => (a, b, c), + Err(_) => return Err(Error::new(EINVAL)), + }; + unsafe { + self.address = base + crate::PHYS_OFFSET; + + self.init(); + let idx = *irq_idx; + let cnt = 3 << 5; //3 * 32 irqs, basic == 8, reg1 = 32, reg2 = 32 + let mut i: usize = 0; + //only support linear irq map now. 
+ while i < cnt && (idx + i < 1024) { + irq_desc[idx + i].basic.ic_idx = ic_idx; + irq_desc[idx + i].basic.ic_irq = i as u32; + irq_desc[idx + i].basic.used = true; + + i += 1; + } + + info!("bcm2835 irq_range = ({}, {})", idx, idx + cnt); + self.irq_range = (idx, idx + cnt); + *irq_idx = idx + cnt; + } + + Ok(()) + } + + fn irq_ack(&mut self) -> u32 { + //TODO: support smp self.read(LOCAL_IRQ_PENDING + 4 * cpu) + let sources = unsafe { self.read(PENDING_0) }; + let pending_num = ffs(sources) - 1; + let fast_irq = [ + 7 + 32, + 9 + 32, + 10 + 32, + 18 + 32, + 19 + 32, + 21 + 64, + 22 + 64, + 23 + 64, + 24 + 64, + 25 + 64, + 30 + 64, + ]; + + //fast irq + if pending_num >= 10 && pending_num <= 20 { + return fast_irq[(pending_num - 10) as usize]; + } + + let pending_num = ffs(sources & 0x3ff) - 1; + match pending_num { + num @ 0..=7 => return num, + 8 => { + let sources1 = unsafe { self.read(PENDING_1) }; + let irq_0_31 = ffs(sources1) - 1; + return irq_0_31 + 32; + } + 9 => { + let sources2 = unsafe { self.read(PENDING_2) }; + let irq_32_63 = ffs(sources2) - 1; + return irq_32_63 + 64; + } + num => { + error!( + "unexpected irq pending in BASIC PENDING: 0x{}, sources = 0x{:08x}", + num, sources + ); + return num; + } + } + } + + fn irq_eoi(&mut self, _irq_num: u32) {} + + fn irq_enable(&mut self, irq_num: u32) { + debug!("bcm2835 enable {} {}", irq_num, irq_num & 0x1f); + match irq_num { + num @ 0..=31 => { + let val = 1 << num; + unsafe { + self.write(ENABLE_0, val); + } + } + num @ 32..=63 => { + let val = 1 << (num & 0x1f); + unsafe { + self.write(ENABLE_1, val); + } + } + num @ 64..=95 => { + let val = 1 << (num & 0x1f); + unsafe { + self.write(ENABLE_2, val); + } + } + _ => return, + } + } + + fn irq_disable(&mut self, irq_num: u32) { + match irq_num { + num @ 0..=31 => { + let val = 1 << num; + unsafe { + self.write(DISABLE_0, val); + } + } + num @ 32..=63 => { + let val = 1 << (num & 0x1f); + unsafe { + self.write(DISABLE_1, val); + } + } + num @ 64..=95 => { 
+ let val = 1 << (num & 0x1f); + unsafe { + self.write(DISABLE_2, val); + } + } + _ => return, + } + } + fn irq_xlate(&self, irq_data: IrqCell) -> Result { + //assert interrupt-cells == 0x2 + match irq_data { + IrqCell::L2(bank, irq) => { + //TODO: check bank && irq + let hwirq = (bank as usize) << 5 | (irq as usize); + let off = hwirq + self.irq_range.0; + Ok(off) + } + _ => Err(Error::new(EINVAL)), + } + } + + fn irq_to_virq(&self, hwirq: u32) -> Option { + if hwirq > 95 { + None + } else { + Some(self.irq_range.0 + hwirq as usize) + } + } +} + +impl InterruptHandler for Bcm2835ArmInterruptController { + fn irq_handler(&mut self, _irq: u32, token: &mut CleanLockToken) { + unsafe { + let irq = self.irq_ack(); + if let Some(virq) = self.irq_to_virq(irq) + && virq < 1024 + { + if let Some(handler) = &mut IRQ_CHIP.irq_desc[virq].handler { + handler.irq_handler(virq as u32, token); + } + } else { + error!("unexpected irq num {}", irq); + } + self.irq_eoi(irq); + } + } +} diff --git a/src/arch/aarch64/device/irqchip/irq_bcm2836.rs b/src/arch/aarch64/device/irqchip/irq_bcm2836.rs new file mode 100644 index 0000000000..8714cdad83 --- /dev/null +++ b/src/arch/aarch64/device/irqchip/irq_bcm2836.rs @@ -0,0 +1,231 @@ +use super::InterruptController; +use crate::{ + arch::device::{ROOT_IC_IDX, ROOT_IC_IDX_IS_SET}, + dtb::{ + get_mmio_address, + irqchip::{InterruptHandler, IrqCell, IrqDesc}, + }, + sync::CleanLockToken, +}; +use core::{ + arch::asm, + ptr::{read_volatile, write_volatile}, + sync::atomic::Ordering, +}; +use fdt::{node::FdtNode, Fdt}; +use syscall::{ + error::{Error, EINVAL}, + Result, +}; + +const LOCAL_CONTROL: u32 = 0x000; +const LOCAL_PRESCALER: u32 = 0x008; +const LOCAL_GPU_ROUTING: u32 = 0x00C; +const LOCAL_TIMER_INT_CONTROL0: u32 = 0x040; +const LOCAL_IRQ_PENDING: u32 = 0x060; + +const LOCAL_IRQ_CNTPNSIRQ: u32 = 0x1; +const LOCAL_IRQ_GPU_FAST: u32 = 0x8; +const LOCAL_IRQ_PMU_FAST: u32 = 0x9; +const LOCAL_IRQ_LAST: u32 = LOCAL_IRQ_PMU_FAST; + 
+#[inline(always)] +fn ffs(num: u32) -> u32 { + let mut x = num; + if x == 0 { + return 0; + } + let mut r = 1; + if (x & 0xffff) == 0 { + x >>= 16; + r += 16; + } + if (x & 0xff) == 0 { + x >>= 8; + r += 8; + } + if (x & 0xf) == 0 { + x >>= 4; + r += 4; + } + if (x & 0x3) == 0 { + x >>= 2; + r += 2; + } + if (x & 0x1) == 0 { + r += 1; + } + + r +} + +pub struct Bcm2836ArmInterruptController { + pub address: usize, + pub irq_range: (usize, usize), + pub active_cpu: u32, +} + +impl Bcm2836ArmInterruptController { + pub fn new() -> Self { + Bcm2836ArmInterruptController { + address: 0, + irq_range: (0, 0), + active_cpu: 0, + } + } + pub fn parse(fdt: &Fdt) -> Result<(usize, usize)> { + if let Some(node) = fdt.find_compatible(&["brcm,bcm2836-l1-intc"]) { + return Bcm2836ArmInterruptController::parse_inner(fdt, &node); + } else { + return Err(Error::new(EINVAL)); + } + } + fn parse_inner(fdt: &Fdt, node: &FdtNode) -> Result<(usize, usize)> { + //assert address_cells == 0x1, size_cells == 0x1 + let reg = node.reg().unwrap().nth(0).unwrap(); + let addr = get_mmio_address(fdt, node, ®).unwrap(); + + Ok((addr, reg.size.unwrap())) + } + + unsafe fn init(&mut self) { + unsafe { + debug!("IRQ BCM2836 INIT"); + //init local timer freq + self.write(LOCAL_CONTROL, 0x0); + self.write(LOCAL_PRESCALER, 0x8000_0000); + + //routing all irq to core + self.write(LOCAL_GPU_ROUTING, self.active_cpu); + debug!("routing all irq to core {}", self.active_cpu); + debug!("IRQ BCM2836 END"); + } + } + + unsafe fn read(&self, reg: u32) -> u32 { + unsafe { + let val = read_volatile((self.address + reg as usize) as *const u32); + val + } + } + + unsafe fn write(&mut self, reg: u32, value: u32) { + unsafe { + write_volatile((self.address + reg as usize) as *mut u32, value); + } + } +} + +impl InterruptHandler for Bcm2836ArmInterruptController { + fn irq_handler(&mut self, _irq: u32, token: &mut CleanLockToken) {} +} + +impl InterruptController for Bcm2836ArmInterruptController { + fn irq_init( + 
&mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + ic_idx: usize, + irq_idx: &mut usize, + ) -> Result<()> { + let (base, _size) = match Bcm2836ArmInterruptController::parse(fdt_opt.unwrap()) { + Ok((a, b)) => (a, b), + Err(_) => return Err(Error::new(EINVAL)), + }; + unsafe { + self.address = base + crate::PHYS_OFFSET; + let cpuid: usize; + asm!("mrs {}, mpidr_el1", out(reg) cpuid); + self.active_cpu = cpuid as u32 & 0x3; + + self.init(); + let idx = *irq_idx; + let cnt = LOCAL_IRQ_LAST as usize; + let mut i: usize = 0; + //only support linear irq map now. + while i < cnt && (idx + i < 1024) { + irq_desc[idx + i].basic.ic_idx = ic_idx; + irq_desc[idx + i].basic.ic_irq = i as u32; + irq_desc[idx + i].basic.used = true; + + i += 1; + } + + info!("bcm2836 irq_range = ({}, {})", idx, idx + cnt); + self.irq_range = (idx, idx + cnt); + *irq_idx = idx + cnt; + } + + //raspi 3b+ dts doesn't follow the rule to set root parent interrupt controller + //so we should set it manually. 
+ ROOT_IC_IDX.store(ic_idx, Ordering::Relaxed); + ROOT_IC_IDX_IS_SET.store(1, Ordering::Relaxed); + + Ok(()) + } + + fn irq_ack(&mut self) -> u32 { + let cpuid: usize; + unsafe { + asm!("mrs {}, mpidr_el1", out(reg) cpuid); + } + let cpu = cpuid as u32 & 0x3; + let sources: u32 = unsafe { self.read(LOCAL_IRQ_PENDING + 4 * cpu) }; + ffs(sources) - 1 + } + + fn irq_eoi(&mut self, _irq_num: u32) {} + + fn irq_enable(&mut self, irq_num: u32) { + debug!("bcm2836 enable {}", irq_num); + match irq_num { + LOCAL_IRQ_CNTPNSIRQ => unsafe { + let cpuid: usize; + asm!("mrs {}, mpidr_el1", out(reg) cpuid); + let cpu = cpuid as u32 & 0x3; + let mut reg_val = self.read(LOCAL_TIMER_INT_CONTROL0 + 4 * cpu); + reg_val |= 0x2; + self.write(LOCAL_TIMER_INT_CONTROL0 + 4 * cpu, reg_val); + }, + LOCAL_IRQ_GPU_FAST => { + //GPU IRQ always enable + } + _ => { + //ignore + } + } + } + + fn irq_disable(&mut self, irq_num: u32) { + match irq_num { + LOCAL_IRQ_CNTPNSIRQ => unsafe { + let cpuid: usize; + asm!("mrs {}, mpidr_el1", out(reg) cpuid); + let cpu = cpuid as u32 & 0x3; + let mut reg_val = self.read(LOCAL_TIMER_INT_CONTROL0 + 4 * cpu); + reg_val &= !0x2; + self.write(LOCAL_TIMER_INT_CONTROL0 + 4 * cpu, reg_val); + }, + LOCAL_IRQ_GPU_FAST => { + //GPU IRQ always enable + } + _ => { + //ignore + } + } + } + fn irq_xlate(&self, irq_data: IrqCell) -> Result { + //assert interrupt-cells == 0x2 + match irq_data { + IrqCell::L2(irq, _) => Ok(irq as usize + self.irq_range.0), + _ => Err(Error::new(EINVAL)), + } + } + fn irq_to_virq(&self, hwirq: u32) -> Option { + if hwirq > LOCAL_IRQ_LAST { + None + } else { + Some(self.irq_range.0 + hwirq as usize) + } + } +} diff --git a/src/arch/aarch64/device/irqchip/mod.rs b/src/arch/aarch64/device/irqchip/mod.rs new file mode 100644 index 0000000000..442569e61b --- /dev/null +++ b/src/arch/aarch64/device/irqchip/mod.rs @@ -0,0 +1,41 @@ +use crate::dtb::irqchip::{InterruptController, IRQ_CHIP}; +use alloc::boxed::Box; +use fdt::{node::FdtNode, Fdt}; + 
+pub(crate) mod gic; +pub(crate) mod gicv3; +mod irq_bcm2835; +mod irq_bcm2836; +mod null; + +pub(crate) fn new_irqchip(ic_str: &str) -> Option> { + if ic_str.contains("arm,gic-v3") { + Some(Box::new(gicv3::GicV3::new())) + } else if ic_str.contains("arm,cortex-a15-gic") || ic_str.contains("arm,gic-400") { + Some(Box::new(gic::GenericInterruptController::new())) + } else if ic_str.contains("brcm,bcm2836-l1-intc") { + Some(Box::new(irq_bcm2836::Bcm2836ArmInterruptController::new())) + } else if ic_str.contains("brcm,bcm2836-armctrl-ic") { + Some(Box::new(irq_bcm2835::Bcm2835ArmInterruptController::new())) + } else { + warn!("no driver for interrupt controller {:?}", ic_str); + //TODO: return None and handle it properly + Some(Box::new(null::Null)) + } +} + +pub(crate) fn ic_for_chip(fdt: &Fdt, node: &FdtNode) -> Option { + if let Some(_) = node.property("interrupts-extended") { + error!("multi-parented device not supported"); + None + } else if let Some(irqc_phandle) = node + .property("interrupt-parent") + .or(fdt.root().property("interrupt-parent")) + .and_then(|f| f.as_usize()) + { + unsafe { IRQ_CHIP.phandle_to_ic_idx(irqc_phandle as u32) } + } else { + error!("no irq parent found"); + None + } +} diff --git a/src/arch/aarch64/device/irqchip/null.rs b/src/arch/aarch64/device/irqchip/null.rs new file mode 100644 index 0000000000..84fb9c53c4 --- /dev/null +++ b/src/arch/aarch64/device/irqchip/null.rs @@ -0,0 +1,41 @@ +use fdt::Fdt; +use syscall::{ + error::{Error, EINVAL}, + Result, +}; + +use super::InterruptController; +use crate::{ + dtb::irqchip::{InterruptHandler, IrqCell, IrqDesc}, + sync::CleanLockToken, +}; + +pub struct Null; + +impl InterruptHandler for Null { + fn irq_handler(&mut self, _irq: u32, token: &mut CleanLockToken) {} +} + +impl InterruptController for Null { + fn irq_init( + &mut self, + _fdt_opt: Option<&Fdt>, + _irq_desc: &mut [IrqDesc; 1024], + _ic_idx: usize, + _irq_idx: &mut usize, + ) -> Result<()> { + Ok(()) + } + fn irq_ack(&mut self) 
-> u32 { + unimplemented!() + } + fn irq_eoi(&mut self, _irq_num: u32) {} + fn irq_enable(&mut self, _irq_num: u32) {} + fn irq_disable(&mut self, _irq_num: u32) {} + fn irq_xlate(&self, _irq_data: IrqCell) -> Result { + Err(Error::new(EINVAL)) + } + fn irq_to_virq(&self, _hwirq: u32) -> Option { + None + } +} diff --git a/src/arch/aarch64/device/mod.rs b/src/arch/aarch64/device/mod.rs new file mode 100644 index 0000000000..cb99150ca4 --- /dev/null +++ b/src/arch/aarch64/device/mod.rs @@ -0,0 +1,60 @@ +use crate::info; +use core::sync::atomic::{AtomicUsize, Ordering}; +use fdt::Fdt; + +pub mod cpu; +pub mod generic_timer; +pub mod irqchip; +pub mod rtc; +pub mod serial; + +use crate::dtb::irqchip::IRQ_CHIP; +use irqchip::ic_for_chip; + +pub static ROOT_IC_IDX: AtomicUsize = AtomicUsize::new(0); +pub static ROOT_IC_IDX_IS_SET: AtomicUsize = AtomicUsize::new(0); + +unsafe fn init_root_ic(fdt: &Fdt) { + unsafe { + let is_set = ROOT_IC_IDX_IS_SET.load(Ordering::Relaxed); + if is_set != 0 { + let ic_idx = ROOT_IC_IDX.load(Ordering::Relaxed); + info!("Already selected {} as root ic", ic_idx); + return; + } + + let root_irqc_phandle = fdt + .root() + .property("interrupt-parent") + .unwrap() + .as_usize() + .unwrap(); + let ic_idx = IRQ_CHIP + .phandle_to_ic_idx(root_irqc_phandle as u32) + .unwrap(); + info!("select {} as root ic", ic_idx); + ROOT_IC_IDX.store(ic_idx, Ordering::Relaxed); + } +} + +pub unsafe fn init_devicetree(fdt: &Fdt) { + unsafe { + info!("IRQCHIP INIT"); + crate::dtb::irqchip::init(&fdt); + init_root_ic(&fdt); + info!("GIT INIT"); + generic_timer::init(fdt); + info!("SERIAL INIT"); + serial::init(fdt); + info!("RTC INIT"); + rtc::init(fdt); + } +} + +pub struct ArchPercpuMisc; + +impl ArchPercpuMisc { + pub const fn default() -> Self { + Self + } +} diff --git a/src/arch/aarch64/device/rtc.rs b/src/arch/aarch64/device/rtc.rs new file mode 100644 index 0000000000..89f7e9e608 --- /dev/null +++ b/src/arch/aarch64/device/rtc.rs @@ -0,0 +1,41 @@ +use 
crate::{dtb::get_mmio_address, sync::CleanLockToken, time}; +use core::ptr::read_volatile; + +static RTC_DR: usize = 0x000; + +pub unsafe fn init(fdt: &fdt::Fdt) { + if let Some(node) = fdt.find_compatible(&["arm,pl031"]) { + match node + .reg() + .and_then(|mut iter| iter.next()) + .and_then(|region| get_mmio_address(fdt, &node, ®ion)) + { + Some(phys) => { + let mut rtc = Pl031rtc { phys }; + info!("PL031 RTC at {:#x}", rtc.phys); + let mut token = unsafe { CleanLockToken::new() }; + *time::START.lock(token.token()) = (rtc.time() as u128) * time::NANOS_PER_SEC; + } + None => { + warn!("No PL031 RTC registers"); + } + } + } else { + warn!("No PL031 RTC found"); + } +} + +struct Pl031rtc { + pub phys: usize, +} + +impl Pl031rtc { + unsafe fn read(&self, reg: usize) -> u32 { + unsafe { read_volatile((crate::PHYS_OFFSET + self.phys + reg) as *const u32) } + } + + pub fn time(&mut self) -> u64 { + let seconds = unsafe { self.read(RTC_DR) } as u64; + seconds + } +} diff --git a/src/arch/aarch64/device/serial.rs b/src/arch/aarch64/device/serial.rs new file mode 100644 index 0000000000..c9555cca31 --- /dev/null +++ b/src/arch/aarch64/device/serial.rs @@ -0,0 +1,59 @@ +use alloc::boxed::Box; +use fdt::Fdt; + +pub use crate::dtb::serial::COM1; +use crate::{ + arch::device::irqchip::ic_for_chip, + dtb::{ + get_interrupt, + irqchip::{register_irq, InterruptHandler, IRQ_CHIP}, + }, + scheme::irq::irq_trigger, + sync::CleanLockToken, +}; + +pub struct Com1Irq {} + +impl InterruptHandler for Com1Irq { + fn irq_handler(&mut self, irq: u32, token: &mut CleanLockToken) { + COM1.lock().receive(token); + unsafe { + // FIXME add_irq accepts a u8 as irq number + // PercpuBlock::current().stats.add_irq(irq); + irq_trigger(irq.try_into().unwrap(), token); + IRQ_CHIP.irq_eoi(irq); + } + } +} + +pub unsafe fn init(fdt: &Fdt) { + unsafe { + //TODO: find actual serial device, not just any PL011 + if let Some(node) = fdt.find_compatible(&["arm,pl011"]) { + let irq = get_interrupt(fdt, &node, 
/// Returns bits [31:26] of an `ESR_EL1` value, the field the exception
/// handlers match on to tell aborts, SVCs and trapped instructions apart.
fn exception_code(esr: usize) -> u8 {
    const CODE_SHIFT: u32 = 26;
    const CODE_MASK: usize = 0x3f;

    let code = (esr >> CODE_SHIFT) & CODE_MASK;
    code as u8
}
flags = GenericPfFlags::empty(); + flags.set(GenericPfFlags::PRESENT, !was_translation_fault); + + // TODO: RMW instructions may "involve" writing to (possibly invalid) memory, but AArch64 + // doesn't appear to require that flag to be set if the read alone would trigger a fault. + flags.set( + GenericPfFlags::INVOLVED_WRITE, + write_not_read_if_data && !instr_not_data, + ); + flags.set(GenericPfFlags::INSTR_NOT_DATA, instr_not_data); + flags.set(GenericPfFlags::USER_NOT_SUPERVISOR, from_user); + + let faulting_addr = VirtualAddress::new(far_el1()); + //dbg!(faulting_addr, flags, from); + + crate::memory::page_fault_handler(stack, flags, faulting_addr).is_ok() + } +} + +unsafe fn cntfrq_el0() -> usize { + unsafe { + let ret: usize; + core::arch::asm!("mrs {}, cntfrq_el0", out(reg) ret); + ret + } +} + +unsafe fn cntpct_el0() -> usize { + unsafe { + let ret: usize; + core::arch::asm!("mrs {}, cntpct_el0", out(reg) ret); + ret + } +} + +unsafe fn cntvct_el0() -> usize { + unsafe { + let ret: usize; + core::arch::asm!("mrs {}, cntvct_el0", out(reg) ret); + ret + } +} + +unsafe fn instr_trapped_msr_mrs_inner( + stack: &mut InterruptStack, + _from_user: bool, + _instr_not_data: bool, + _from: &str, +) -> bool { + unsafe { + let iss = iss(stack.iret.esr_el1); + // let res0 = (iss & 0x1C0_0000) >> 22; + let op0 = (iss & 0x030_0000) >> 20; + let op2 = (iss & 0x00e_0000) >> 17; + let op1 = (iss & 0x001_c000) >> 14; + let crn = (iss & 0x000_3c00) >> 10; + let rt = (iss & 0x000_03e0) >> 5; + let crm = (iss & 0x000_001e) >> 1; + let dir = iss & 0x000_0001; + + /* + print!("iss=0x{:x}, res0=0b{:03b}, op0=0b{:02b}\n + op2=0b{:03b}, op1=0b{:03b}, crn=0b{:04b}\n + rt=0b{:05b}, crm=0b{:04b}, dir=0b{:b}\n", + iss, res0, op0, op2, op1, crn, rt, crm, dir); + */ + + match (op0, op1, crn, crm, op2, dir) { + //MRS , CNTFRQ_EL0 + (0b11, 0b011, 0b1110, 0b0000, 0b000, 0b1) => { + let reg_val = cntfrq_el0(); + stack.store_reg(rt as usize, reg_val); + //skip faulting instruction, A64 
instructions are always 32-bits + stack.iret.elr_el1 += 4; + return true; + } + //MRS , CNTPCT_EL0 + (0b11, 0b011, 0b1110, 0b0000, 0b001, 0b1) => { + let reg_val = cntpct_el0(); + stack.store_reg(rt as usize, reg_val); + //skip faulting instruction, A64 instructions are always 32-bits + stack.iret.elr_el1 += 4; + return true; + } + //MRS , CNTVCT_EL0 + (0b11, 0b011, 0b1110, 0b0000, 0b010, 0b1) => { + let reg_val = cntvct_el0(); + stack.store_reg(rt as usize, reg_val); + //skip faulting instruction, A64 instructions are always 32-bits + stack.iret.elr_el1 += 4; + return true; + } + _ => {} + } + + false + } +} + +exception_stack!(synchronous_exception_at_el1_with_spx, |stack| { + unsafe { + if !pf_inner( + stack, + exception_code(stack.iret.esr_el1), + "sync_exc_el1_spx", + ) { + println!("Synchronous exception at EL1 with SPx"); + if exception_code(stack.iret.esr_el1) == 0b100101 { + let far_el1 = far_el1(); + println!("FAR_EL1 = 0x{:08x}", far_el1); + } else if exception_code(stack.iret.esr_el1) == 0b100100 { + let far_el1 = far_el1(); + println!("USER FAR_EL1 = 0x{:08x}", far_el1); + } + stack.trace(); + loop {} + } + } +}); +unsafe fn pf_inner(stack: &mut InterruptStack, ty: u8, from: &str) -> bool { + unsafe { + match ty { + // "Data Abort taken from a lower Exception level" + 0b100100 => instr_data_abort_inner(stack, true, false, from), + // "Data Abort taken without a change in Exception level" + 0b100101 => instr_data_abort_inner(stack, false, false, from), + // "Instruction Abort taken from a lower Exception level" + 0b100000 => instr_data_abort_inner(stack, true, true, from), + // "Instruction Abort taken without a change in Exception level" + 0b100001 => instr_data_abort_inner(stack, false, true, from), + // "Trapped MSR, MRS or System instruction execution in AArch64 state" + 0b011000 => instr_trapped_msr_mrs_inner(stack, true, true, from), + + _ => return false, + } + } +} + +exception_stack!(synchronous_exception_at_el0, |stack| { + unsafe { + match 
exception_code(stack.iret.esr_el1) { + 0b010101 => { + let scratch = &stack.scratch; + let mut token = CleanLockToken::new(); + let ret = syscall::syscall( + scratch.x8, scratch.x0, scratch.x1, scratch.x2, scratch.x3, scratch.x4, + scratch.x5, &mut token, + ); + stack.scratch.x0 = ret; + } + + ty => { + if !pf_inner(stack, ty as u8, "sync_exc_el0") { + error!( + "FATAL: Not an SVC induced synchronous exception (ty={:b})", + ty + ); + println!("FAR_EL1: {:#0x}", far_el1()); + //crate::debugger::debugger(None); + stack.trace(); + excp_handler(Exception { + kind: 0, // TODO + }); + } + } + } + } +}); + +exception_stack!(unhandled_exception, |stack| { + println!("Unhandled exception"); + stack.trace(); + loop {} +}); + +impl ArchIntCtx for InterruptStack { + fn ip(&self) -> usize { + self.iret.elr_el1 + } + fn recover_and_efault(&mut self) { + // Set the return value to nonzero to indicate usercopy failure (EFAULT), and emulate the + // return instruction by setting the return pointer to the saved LR value. 
+ + self.iret.elr_el1 = self.preserved.x30; + self.scratch.x0 = 1; + } +} diff --git a/src/arch/aarch64/interrupt/handler.rs b/src/arch/aarch64/interrupt/handler.rs new file mode 100644 index 0000000000..641babbfb7 --- /dev/null +++ b/src/arch/aarch64/interrupt/handler.rs @@ -0,0 +1,420 @@ +use crate::{panic, syscall::IntRegisters}; + +#[derive(Default)] +#[repr(C, packed)] +pub struct ScratchRegisters { + pub x0: usize, + pub x1: usize, + pub x2: usize, + pub x3: usize, + pub x4: usize, + pub x5: usize, + pub x6: usize, + pub x7: usize, + pub x8: usize, + pub x9: usize, + pub x10: usize, + pub x11: usize, + pub x12: usize, + pub x13: usize, + pub x14: usize, + pub x15: usize, + pub x16: usize, + pub x17: usize, + pub x18: usize, + pub _padding: usize, +} + +impl ScratchRegisters { + pub fn dump(&self) { + println!("X0: {:>016X}", { self.x0 }); + println!("X1: {:>016X}", { self.x1 }); + println!("X2: {:>016X}", { self.x2 }); + println!("X3: {:>016X}", { self.x3 }); + println!("X4: {:>016X}", { self.x4 }); + println!("X5: {:>016X}", { self.x5 }); + println!("X6: {:>016X}", { self.x6 }); + println!("X7: {:>016X}", { self.x7 }); + println!("X8: {:>016X}", { self.x8 }); + println!("X9: {:>016X}", { self.x9 }); + println!("X10: {:>016X}", { self.x10 }); + println!("X11: {:>016X}", { self.x11 }); + println!("X12: {:>016X}", { self.x12 }); + println!("X13: {:>016X}", { self.x13 }); + println!("X14: {:>016X}", { self.x14 }); + println!("X15: {:>016X}", { self.x15 }); + println!("X16: {:>016X}", { self.x16 }); + println!("X17: {:>016X}", { self.x17 }); + println!("X18: {:>016X}", { self.x18 }); + } +} + +#[derive(Default)] +#[repr(C, packed)] +pub struct PreservedRegisters { + //TODO: is X30 a preserved register? 
+ pub x19: usize, + pub x20: usize, + pub x21: usize, + pub x22: usize, + pub x23: usize, + pub x24: usize, + pub x25: usize, + pub x26: usize, + pub x27: usize, + pub x28: usize, + pub x29: usize, + pub x30: usize, +} + +impl PreservedRegisters { + pub fn dump(&self) { + println!("X19: {:>016X}", { self.x19 }); + println!("X20: {:>016X}", { self.x20 }); + println!("X21: {:>016X}", { self.x21 }); + println!("X22: {:>016X}", { self.x22 }); + println!("X23: {:>016X}", { self.x23 }); + println!("X24: {:>016X}", { self.x24 }); + println!("X25: {:>016X}", { self.x25 }); + println!("X26: {:>016X}", { self.x26 }); + println!("X27: {:>016X}", { self.x27 }); + println!("X28: {:>016X}", { self.x28 }); + println!("X29: {:>016X}", { self.x29 }); + println!("X30: {:>016X}", { self.x30 }); + } +} + +#[derive(Default)] +#[repr(C, packed)] +pub struct IretRegisters { + // The exception vector disambiguates at which EL the interrupt + // occurred + pub sp_el0: usize, // Shouldn't be used if interrupt occurred at EL1 + pub esr_el1: usize, + pub spsr_el1: usize, + pub elr_el1: usize, +} + +impl IretRegisters { + pub fn dump(&self) { + println!("ELR_EL1: {:>016X}", { self.elr_el1 }); + println!("SPSR_EL1: {:>016X}", { self.spsr_el1 }); + println!("ESR_EL1: {:>016X}", { self.esr_el1 }); + println!("SP_EL0: {:>016X}", { self.sp_el0 }); + } +} + +#[derive(Default)] +#[repr(C, packed)] +pub struct InterruptStack { + pub iret: IretRegisters, + pub scratch: ScratchRegisters, + pub preserved: PreservedRegisters, +} + +impl InterruptStack { + pub fn init(&mut self) {} + pub fn frame_pointer(&self) -> usize { + self.preserved.x29 + } + pub fn stack_pointer(&self) -> usize { + self.iret.sp_el0 + } + pub fn set_stack_pointer(&mut self, sp: usize) { + self.iret.sp_el0 = sp; + } + pub fn sig_archdep_reg(&self) -> usize { + self.scratch.x0 + } + pub fn set_instr_pointer(&mut self, ip: usize) { + self.iret.elr_el1 = ip; + } + pub fn instr_pointer(&self) -> usize { + self.iret.elr_el1 + } + pub fn
set_arg1(&mut self, arg_opt: Option) { + if let Some(arg) = arg_opt { + self.scratch.x1 = arg; + } + } + pub fn dump(&self) { + self.iret.dump(); + self.scratch.dump(); + self.preserved.dump(); + } + pub fn trace(&self) { + self.dump(); + unsafe { + panic::user_stack_trace(&self); + panic::stack_trace(); + } + } + + /// Saves all registers to a struct used by the proc: + /// scheme to read/write registers. + pub fn save(&self, all: &mut IntRegisters) { + /*TODO: aarch64 registers + all.elr_el1 = self.iret.elr_el1; + all.spsr_el1 = self.iret.spsr_el1; + all.esr_el1 = self.iret.esr_el1; + all.sp_el0 = self.iret.sp_el0; + all.padding = 0; + */ + all.x30 = self.preserved.x30; + all.x29 = self.preserved.x29; + all.x28 = self.preserved.x28; + all.x27 = self.preserved.x27; + all.x26 = self.preserved.x26; + all.x25 = self.preserved.x25; + all.x24 = self.preserved.x24; + all.x23 = self.preserved.x23; + all.x22 = self.preserved.x22; + all.x21 = self.preserved.x21; + all.x20 = self.preserved.x20; + all.x19 = self.preserved.x19; + all.x18 = self.scratch.x18; + all.x17 = self.scratch.x17; + all.x16 = self.scratch.x16; + all.x15 = self.scratch.x15; + all.x14 = self.scratch.x14; + all.x13 = self.scratch.x13; + all.x12 = self.scratch.x12; + all.x11 = self.scratch.x11; + all.x10 = self.scratch.x10; + all.x9 = self.scratch.x9; + all.x8 = self.scratch.x8; + all.x7 = self.scratch.x7; + all.x6 = self.scratch.x6; + all.x5 = self.scratch.x5; + all.x4 = self.scratch.x4; + all.x3 = self.scratch.x3; + all.x2 = self.scratch.x2; + all.x1 = self.scratch.x1; + all.x0 = self.scratch.x0; + } + + /// Loads all registers from a struct used by the proc: + /// scheme to read/write registers. 
+ pub fn load(&mut self, all: &IntRegisters) { + /*TODO: aarch64 registers + self.iret.elr_el1 = all.elr_el1; + self.iret.spsr_el1 = all.spsr_el1; + self.iret.esr_el1 = all.esr_el1; + self.iret.sp_el0 = all.sp_el0; + */ + self.preserved.x30 = all.x30; + self.preserved.x29 = all.x29; + self.preserved.x28 = all.x28; + self.preserved.x27 = all.x27; + self.preserved.x26 = all.x26; + self.preserved.x25 = all.x25; + self.preserved.x24 = all.x24; + self.preserved.x23 = all.x23; + self.preserved.x22 = all.x22; + self.preserved.x21 = all.x21; + self.preserved.x20 = all.x20; + self.preserved.x19 = all.x19; + self.scratch.x18 = all.x18; + self.scratch.x17 = all.x17; + self.scratch.x16 = all.x16; + self.scratch.x15 = all.x15; + self.scratch.x14 = all.x14; + self.scratch.x13 = all.x13; + self.scratch.x12 = all.x12; + self.scratch.x11 = all.x11; + self.scratch.x10 = all.x10; + self.scratch.x9 = all.x9; + self.scratch.x8 = all.x8; + self.scratch.x7 = all.x7; + self.scratch.x6 = all.x6; + self.scratch.x5 = all.x5; + self.scratch.x4 = all.x4; + self.scratch.x3 = all.x3; + self.scratch.x2 = all.x2; + self.scratch.x1 = all.x1; + self.scratch.x0 = all.x0; + } + + /// Store `val` into general-purpose register x`idx` (0..=30); any other index is ignored + pub fn store_reg(&mut self, idx: usize, val: usize) { + match idx { + 0 => self.scratch.x0 = val, + 1 => self.scratch.x1 = val, + 2 => self.scratch.x2 = val, + 3 => self.scratch.x3 = val, + 4 => self.scratch.x4 = val, + 5 => self.scratch.x5 = val, + 6 => self.scratch.x6 = val, + 7 => self.scratch.x7 = val, + 8 => self.scratch.x8 = val, + 9 => self.scratch.x9 = val, + 10 => self.scratch.x10 = val, + 11 => self.scratch.x11 = val, + 12 => self.scratch.x12 = val, + 13 => self.scratch.x13 = val, + 14 => self.scratch.x14 = val, + 15 => self.scratch.x15 = val, + 16 => self.scratch.x16 = val, + 17 => self.scratch.x17 = val, + 18 => self.scratch.x18 = val, + 19 => self.preserved.x19 = val, + 20 => self.preserved.x20 = val, + 21 => self.preserved.x21 = val, + 22 => self.preserved.x22 = val, + 23
=> self.preserved.x23 = val, + 24 => self.preserved.x24 = val, + 25 => self.preserved.x25 = val, + 26 => self.preserved.x26 = val, + 27 => self.preserved.x27 = val, + 28 => self.preserved.x28 = val, + 29 => self.preserved.x29 = val, + 30 => self.preserved.x30 = val, + _ => {} + } + } + + //TODO + pub fn set_singlestep(&mut self, _singlestep: bool) {} +} + +#[macro_export] +macro_rules! push_scratch { + () => { + " + // Push scratch registers + str x18, [sp, #-16]! + stp x16, x17, [sp, #-16]! + stp x14, x15, [sp, #-16]! + stp x12, x13, [sp, #-16]! + stp x10, x11, [sp, #-16]! + stp x8, x9, [sp, #-16]! + stp x6, x7, [sp, #-16]! + stp x4, x5, [sp, #-16]! + stp x2, x3, [sp, #-16]! + stp x0, x1, [sp, #-16]! + " + }; +} + +#[macro_export] +macro_rules! pop_scratch { + () => { + " + // Pop scratch registers + ldp x0, x1, [sp], #16 + ldp x2, x3, [sp], #16 + ldp x4, x5, [sp], #16 + ldp x6, x7, [sp], #16 + ldp x8, x9, [sp], #16 + ldp x10, x11, [sp], #16 + ldp x12, x13, [sp], #16 + ldp x14, x15, [sp], #16 + ldp x16, x17, [sp], #16 + ldr x18, [sp], #16 + " + }; +} + +#[macro_export] +macro_rules! push_preserved { + () => { + " + // Push preserved registers + stp x29, x30, [sp, #-16]! + stp x27, x28, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x19, x20, [sp, #-16]! + " + }; +} + +#[macro_export] +macro_rules! pop_preserved { + () => { + " + // Pop preserved registers + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x29, x30, [sp], #16 + " + }; +} + +#[macro_export] +macro_rules! push_special { + () => { + " + mrs x14, spsr_el1 + mrs x15, elr_el1 + stp x14, x15, [sp, #-16]! + + mrs x14, sp_el0 + mrs x15, esr_el1 + stp x14, x15, [sp, #-16]! + " + }; +} + +#[macro_export] +macro_rules! 
pop_special { + () => { + " + ldp x14, x15, [sp], 16 + msr esr_el1, x15 + msr sp_el0, x14 + + ldp x14, x15, [sp], 16 + msr elr_el1, x15 + msr spsr_el1, x14 + " + }; +} + +#[macro_export] +macro_rules! exception_stack { + ($name:ident, |$stack:ident| $code:block) => { + #[unsafe(naked)] + #[unsafe(no_mangle)] + pub unsafe extern "C" fn $name(stack: &mut $crate::arch::aarch64::interrupt::InterruptStack) { + unsafe extern "C" fn inner($stack: &mut $crate::arch::aarch64::interrupt::InterruptStack) { + $code + } + core::arch::naked_asm!( + // Backup all userspace registers to stack + push_preserved!(), + push_scratch!(), + push_special!(), + + // Call inner function with pointer to stack + "mov x29, sp", + "mov x0, sp", + "bl {}", + + // Restore all userspace registers + pop_special!(), + pop_scratch!(), + pop_preserved!(), + + "eret", + + sym inner, + ); + } + }; +} +#[unsafe(naked)] +pub unsafe extern "C" fn enter_usermode() -> ! { + core::arch::naked_asm!( + "blr x28", + // Restore all userspace registers + pop_special!(), + pop_scratch!(), + pop_preserved!(), + "eret", + ); +} diff --git a/src/arch/aarch64/interrupt/irq.rs b/src/arch/aarch64/interrupt/irq.rs new file mode 100644 index 0000000000..a9a036aa8c --- /dev/null +++ b/src/arch/aarch64/interrupt/irq.rs @@ -0,0 +1,56 @@ +use crate::{arch::device::ROOT_IC_IDX, dtb::irqchip::IRQ_CHIP, sync::CleanLockToken}; +use core::sync::atomic::Ordering; + +// use crate::percpu::PercpuBlock; + +unsafe fn irq_ack() -> (u32, Option) { + unsafe { + let ic = &mut IRQ_CHIP.irq_chip_list.chips[ROOT_IC_IDX.load(Ordering::Relaxed)].ic; + let irq = ic.irq_ack(); + (irq, ic.irq_to_virq(irq)) + } +} + +exception_stack!(irq_at_el0, |_stack| { + unsafe { + let mut token = CleanLockToken::new(); + let (irq, virq) = irq_ack(); + if let Some(virq) = virq + && virq < 1024 + { + IRQ_CHIP.trigger_virq(virq as u32, &mut token); + } else { + println!("unexpected irq num {}", irq); + } + } +}); + +exception_stack!(irq_at_el1, |_stack| { + unsafe 
{ + let mut token = CleanLockToken::new(); + let (irq, virq) = irq_ack(); + if let Some(virq) = virq + && virq < 1024 + { + IRQ_CHIP.trigger_virq(virq as u32, &mut token); + } else { + println!("unexpected irq num {}", irq); + } + } +}); + +/* +pub unsafe fn irq_handler_gentimer(irq: u32) { + GENTIMER.clear_irq(); + { + *time::OFFSET.lock() += GENTIMER.clk_freq as u128; + } + + timeout::trigger(); + + context::switch::tick(); + + trigger(irq); + GENTIMER.reload_count(); +} +*/ diff --git a/src/arch/aarch64/interrupt/mod.rs b/src/arch/aarch64/interrupt/mod.rs new file mode 100644 index 0000000000..13d7b8ff6d --- /dev/null +++ b/src/arch/aarch64/interrupt/mod.rs @@ -0,0 +1,49 @@ +//! Interrupt instructions + +use core::arch::asm; + +#[macro_use] +pub mod handler; + +pub mod exception; +pub mod irq; +pub mod syscall; +pub mod trace; + +pub use self::handler::InterruptStack; + +/// Mask IRQs on this CPU (`msr daifset, #2` sets the DAIF I bit) +#[inline(always)] +pub unsafe fn disable() { + unsafe { + asm!("msr daifset, #2"); + } +} + +/// Set interrupts and halt +/// This will atomically wait for the next interrupt +/// Performing enable followed by halt is not guaranteed to be atomic, use this instead! +#[inline(always)] +pub unsafe fn enable_and_halt() { + unsafe { + asm!("wfi", "msr daifclr, #2", "nop"); + } +} + +/// Set interrupts and nop +/// This will unmask IRQs (clears the DAIF I bit) and allow a pending IRQ to be taken +/// Simply enabling interrupts does not guarantee that they will trigger, use this instead!
+#[inline(always)] +pub unsafe fn enable_and_nop() { + unsafe { + asm!("msr daifclr, #2", "nop"); + } +} + +/// Halt instruction +#[inline(always)] +pub unsafe fn halt() { + unsafe { + asm!("wfi"); + } +} diff --git a/src/arch/aarch64/interrupt/syscall.rs b/src/arch/aarch64/interrupt/syscall.rs new file mode 100644 index 0000000000..2b17a71182 --- /dev/null +++ b/src/arch/aarch64/interrupt/syscall.rs @@ -0,0 +1,49 @@ +#[unsafe(no_mangle)] +pub unsafe extern "C" fn do_exception_unhandled() {} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn do_exception_synchronous() {} + +#[allow(dead_code)] +#[repr(C, packed)] +pub struct SyscallStack { + pub elr_el1: usize, + pub padding: usize, + pub tpidr: usize, + pub tpidrro: usize, + pub rflags: usize, + pub esr: usize, + pub sp: usize, + pub lr: usize, + pub fp: usize, + pub x28: usize, + pub x27: usize, + pub x26: usize, + pub x25: usize, + pub x24: usize, + pub x23: usize, + pub x22: usize, + pub x21: usize, + pub x20: usize, + pub x19: usize, + pub x18: usize, + pub x17: usize, + pub x16: usize, + pub x15: usize, + pub x14: usize, + pub x13: usize, + pub x12: usize, + pub x11: usize, + pub x10: usize, + pub x9: usize, + pub x8: usize, + pub x7: usize, + pub x6: usize, + pub x5: usize, + pub x4: usize, + pub x3: usize, + pub x2: usize, + pub x1: usize, + pub x0: usize, +} +pub use super::handler::enter_usermode; diff --git a/src/arch/aarch64/interrupt/trace.rs b/src/arch/aarch64/interrupt/trace.rs new file mode 100644 index 0000000000..4f7ea6843b --- /dev/null +++ b/src/arch/aarch64/interrupt/trace.rs @@ -0,0 +1,32 @@ +use core::arch::asm; + +pub struct StackTrace { + pub fp: usize, + pub pc_ptr: *const usize, +} + +impl StackTrace { + #[inline(always)] + pub unsafe fn start() -> Option { + unsafe { + let fp: usize; + asm!("mov {}, fp", out(reg) fp); + let pc_ptr = fp.checked_add(size_of::())?; + Some(StackTrace { + fp, + pc_ptr: pc_ptr as *const usize, + }) + } + } + + pub unsafe fn next(self) -> Option { + unsafe { + 
let fp = *(self.fp as *const usize); + let pc_ptr = fp.checked_add(size_of::())?; + Some(StackTrace { + fp: fp, + pc_ptr: pc_ptr as *const usize, + }) + } + } +} diff --git a/src/arch/aarch64/ipi.rs b/src/arch/aarch64/ipi.rs new file mode 100644 index 0000000000..624209b175 --- /dev/null +++ b/src/arch/aarch64/ipi.rs @@ -0,0 +1,30 @@ +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum IpiKind { + Wakeup = 0x40, + Tlb = 0x41, +} + +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum IpiTarget { + Other = 3, +} + +#[inline(always)] +pub fn ipi(_kind: IpiKind, _target: IpiTarget) { + if cfg!(not(feature = "multi_core")) { + return; + } + + // FIXME implement +} + +#[inline(always)] +pub fn ipi_single(_kind: IpiKind, _target: &crate::percpu::PercpuBlock) { + if cfg!(not(feature = "multi_core")) { + return; + } + + // FIXME implement +} diff --git a/src/arch/aarch64/misc.rs b/src/arch/aarch64/misc.rs new file mode 100644 index 0000000000..ac54d767f2 --- /dev/null +++ b/src/arch/aarch64/misc.rs @@ -0,0 +1,23 @@ +use crate::{ + cpu_set::LogicalCpuId, + memory::{RmmA, RmmArch}, + percpu::PercpuBlock, +}; + +impl PercpuBlock { + pub fn current() -> &'static Self { + unsafe { &*(crate::arch::device::cpu::registers::control_regs::tpidr_el1() as *const Self) } + } +} + +#[cold] +pub unsafe fn init(cpu_id: LogicalCpuId) { + unsafe { + let frame = crate::memory::allocate_frame().expect("failed to allocate percpu memory"); + let virt = RmmA::phys_to_virt(frame.base()).data() as *mut PercpuBlock; + + virt.write(PercpuBlock::init(cpu_id)); + + crate::arch::device::cpu::registers::control_regs::tpidr_el1_write(virt as u64); + } +} diff --git a/src/arch/aarch64/mod.rs b/src/arch/aarch64/mod.rs new file mode 100644 index 0000000000..2248c85a30 --- /dev/null +++ b/src/arch/aarch64/mod.rs @@ -0,0 +1,71 @@ +/// Constants like memory locations +pub mod consts; + +/// Debugging support +pub mod debug; + +/// Devices +pub mod device; + +/// Interrupt instructions +pub mod interrupt; + 
+/// Inter-processor interrupts +pub mod ipi; + +/// Miscellaneous +pub mod misc; + +/// Paging +pub mod paging; + +/// Initialization and start function +pub mod start; + +/// Stop function +pub mod stop; + +// Interrupt vectors +pub mod vectors; + +pub mod time; + +pub use ::rmm::aarch64::AArch64Arch as CurrentRmmArch; + +pub use arch_copy_to_user as arch_copy_from_user; + +#[unsafe(naked)] +pub unsafe extern "C" fn arch_copy_to_user(dst: usize, src: usize, len: usize) -> u8 { + // dst, src, len arrive in x0, x1, x2; x0 is zeroed up front and returned once len bytes are copied + core::arch::naked_asm!( + " + .global __usercopy_start + __usercopy_start: + mov x4, x0 + mov x0, 0 + 2: + cmp x2, 0 + b.eq 3f + + ldrb w3, [x1] + strb w3, [x4] + + add x4, x4, 1 + add x1, x1, 1 + sub x2, x2, 1 + + b 2b + 3: + ret + .global __usercopy_end + __usercopy_end: + " + ); +} + +pub const KFX_SIZE: usize = 1024; + +// This function exists as the KFX size is dynamic on x86_64. +pub fn kfx_size() -> usize { + KFX_SIZE +} diff --git a/src/arch/aarch64/paging.rs b/src/arch/aarch64/paging.rs new file mode 100644 index 0000000000..a28de73e64 --- /dev/null +++ b/src/arch/aarch64/paging.rs @@ -0,0 +1,7 @@ +/// Initialize MAIR +#[cold] +pub unsafe fn init() { + unsafe { + rmm::aarch64::init_mair(); + } +} diff --git a/src/arch/aarch64/start.rs b/src/arch/aarch64/start.rs new file mode 100644 index 0000000000..e1c8cfb4ae --- /dev/null +++ b/src/arch/aarch64/start.rs @@ -0,0 +1,148 @@ +//! This module is where the kernel sets up its exception/IRQ handlers +//! It is incredibly unsafe, and should be minimal in nature +//! It must point VBAR_EL1 at `exception_vector_base`; the vector entries are +//! defined in other files inside of the `arch` module +use core::{arch::naked_asm, cell::SyncUnsafeCell, slice}; + +use fdt::Fdt; + +use crate::{ + allocator, + arch::{device, paging}, + devices::graphical_debug, + dtb, + startup::KernelArgs, +}; + +/// Test of zero values in BSS. +static mut BSS_TEST_ZERO: usize = 0; +/// Test of non-zero values in data.
+static mut DATA_TEST_NONZERO: usize = 0xFFFF_FFFF_FFFF_FFFF; + +#[repr(C, align(16))] +struct StackAlign(T); + +static STACK: SyncUnsafeCell> = + SyncUnsafeCell::new(StackAlign([0; 128 * 1024])); + +// FIXME use extern "custom" +#[unsafe(naked)] +#[unsafe(no_mangle)] +extern "C" fn kstart() { + naked_asm!(" + // BSS should already be zero + adrp x9, {bss_test_zero} + ldr x9, [x9, :lo12:{bss_test_zero}] + cbnz x9, .Lkstart_crash + adrp x9, {data_test_nonzero} + ldr x9, [x9, :lo12:{data_test_nonzero}] + cbz x9, .Lkstart_crash + + adrp x1, {stack} + add x1, x1, :lo12:{stack} + mov x2, {stack_size}-16 + add sp, x1, x2 + + // Setup interrupt handlers + ldr x9, =exception_vector_base + msr vbar_el1, x9 + + mov lr, 0 + b {start} + + .Lkstart_crash: + mov x9, 0 + br x9 + ", + bss_test_zero = sym BSS_TEST_ZERO, + data_test_nonzero = sym DATA_TEST_NONZERO, + stack = sym STACK, + stack_size = const size_of_val(&STACK), + start = sym start, + ); +} + +/// The entry to Rust, all things must be initialized +unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! { + unsafe { + let bootstrap = { + let args = args_ptr.read(); + + // Set up graphical debug + graphical_debug::init(args.env()); + + // Get hardware descriptor data + //TODO: use env {DTB,RSDT}_{BASE,SIZE}? 
+ let hwdesc_data = if args.hwdesc_base != 0 { + Some(slice::from_raw_parts( + (crate::PHYS_OFFSET + args.hwdesc_base as usize) as *const u8, + args.hwdesc_size as usize, + )) + } else { + None + }; + + let dtb_res = hwdesc_data + .ok_or(fdt::FdtError::BadPtr) + .and_then(|data| Fdt::new(data)); + + // Try to find serial port prior to logging + if let Ok(dtb) = &dtb_res { + dtb::serial::init_early(dtb); + } + + info!("Redox OS starting..."); + args.print(); + + // Initialize RMM + crate::startup::memory::init(&args, None, None); + + // Initialize paging + paging::init(); + + crate::arch::misc::init(crate::cpu_set::LogicalCpuId::new(0)); + + // Setup kernel heap + allocator::init(); + + // Activate memory logging + crate::log::init(); + + // Initialize devices + match dtb_res { + Ok(dtb) => { + dtb::init(hwdesc_data.map(|slice| (slice.as_ptr() as usize, slice.len()))); + device::init_devicetree(&dtb); + } + Err(err) => { + dtb::init(None); + warn!("failed to parse DTB: {}", err); + + #[cfg(feature = "acpi")] + { + crate::acpi::init(args.acpi_rsdp()); + } + } + } + + args.bootstrap() + }; + + crate::startup::kmain(bootstrap); + } +} + +#[repr(C, packed)] +#[allow(unused)] +pub struct KernelArgsAp { + cpu_id: u64, + page_table: u64, + stack_start: u64, + stack_end: u64, +} + +/// Entry to rust for an AP +#[allow(unused)] +pub unsafe extern "C" fn kstart_ap(_args_ptr: *const KernelArgsAp) -> ! { + loop {} +} diff --git a/src/arch/aarch64/stop.rs b/src/arch/aarch64/stop.rs new file mode 100644 index 0000000000..6c832f9d0e --- /dev/null +++ b/src/arch/aarch64/stop.rs @@ -0,0 +1,33 @@ +use crate::sync::CleanLockToken; +use core::arch::asm; + +pub unsafe fn kreset() -> ! { + unsafe { + println!("kreset"); + + asm!("hvc #0", + in("x0") 0x8400_0009_usize, + options(noreturn), + ) + } +} + +pub unsafe fn emergency_reset() -> ! { + unsafe { + asm!("hvc #0", + in("x0") 0x8400_0009_usize, + options(noreturn), + ) + } +} + +pub unsafe fn kstop(_token: &mut CleanLockToken) -> ! 
{ + unsafe { + println!("kstop"); + + asm!("hvc #0", + in("x0") 0x8400_0008_usize, + options(noreturn), + ) + } +} diff --git a/src/arch/aarch64/time.rs b/src/arch/aarch64/time.rs new file mode 100644 index 0000000000..53e62bb08a --- /dev/null +++ b/src/arch/aarch64/time.rs @@ -0,0 +1,18 @@ +use crate::{sync::CleanLockToken, time::NANOS_PER_SEC}; + +pub fn monotonic_absolute(_token: &mut CleanLockToken) -> u128 { + //TODO: aarch64 generic timer counter + let ticks: usize; + unsafe { core::arch::asm!("mrs {}, cntpct_el0", out(reg) ticks) }; + let freq: usize; + unsafe { core::arch::asm!("mrs {}, cntfrq_el0", out(reg) freq) }; + + ticks as u128 * NANOS_PER_SEC / freq as u128 +} + +pub fn monotonic_resolution() -> u128 { + let freq: usize; + unsafe { core::arch::asm!("mrs {}, cntfrq_el0", out(reg) freq) }; + + NANOS_PER_SEC / freq as u128 +} diff --git a/src/arch/aarch64/vectors.rs b/src/arch/aarch64/vectors.rs new file mode 100644 index 0000000000..f8bf0e46d1 --- /dev/null +++ b/src/arch/aarch64/vectors.rs @@ -0,0 +1,112 @@ +core::arch::global_asm!( + " + // Exception vector stubs + // + // Unhandled exceptions spin in a wfi loop for the moment + // This can be macro-ified + +.globl exception_vector_base + + .align 11 +exception_vector_base: + + // Synchronous + .align 7 +__vec_00: + b synchronous_exception_at_el1_with_sp0 + b __vec_00 + + // IRQ + .align 7 +__vec_01: + b irq_at_el1 + b __vec_01 + + // FIQ + .align 7 +__vec_02: + b unhandled_exception + b __vec_02 + + // SError + .align 7 +__vec_03: + b unhandled_exception + b __vec_03 + + // Synchronous + .align 7 +__vec_04: + b synchronous_exception_at_el1_with_spx + b __vec_04 + + // IRQ + .align 7 +__vec_05: + b irq_at_el1 + b __vec_05 + + // FIQ + .align 7 +__vec_06: + b unhandled_exception + b __vec_06 + + // SError + .align 7 +__vec_07: + b unhandled_exception + b __vec_07 + + // Synchronous + .align 7 +__vec_08: + b synchronous_exception_at_el0 + b __vec_08 + + // IRQ + .align 7 +__vec_09: + b irq_at_el0 + b 
__vec_09 + + // FIQ + .align 7 +__vec_10: + b unhandled_exception + b __vec_10 + + // SError + .align 7 +__vec_11: + b unhandled_exception + b __vec_11 + + // Synchronous + .align 7 +__vec_12: + b unhandled_exception + b __vec_12 + + // IRQ + .align 7 +__vec_13: + b unhandled_exception + b __vec_13 + + // FIQ + .align 7 +__vec_14: + b unhandled_exception + b __vec_14 + + // SError + .align 7 +__vec_15: + b unhandled_exception + b __vec_15 + + .align 7 +exception_vector_end: +" +); diff --git a/src/arch/mod.rs b/src/arch/mod.rs new file mode 100644 index 0000000000..2aae6399b1 --- /dev/null +++ b/src/arch/mod.rs @@ -0,0 +1,27 @@ +#[cfg(target_arch = "aarch64")] +#[macro_use] +pub mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use self::aarch64::*; + +#[cfg(target_arch = "x86")] +#[macro_use] +pub mod x86; +#[cfg(target_arch = "x86")] +pub use self::x86::*; + +#[cfg(target_arch = "x86_64")] +#[macro_use] +pub mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use self::x86_64::*; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[macro_use] +mod x86_shared; + +#[cfg(target_arch = "riscv64")] +#[macro_use] +pub mod riscv64; +#[cfg(target_arch = "riscv64")] +pub use self::riscv64::*; diff --git a/src/arch/riscv64/consts.rs b/src/arch/riscv64/consts.rs new file mode 100644 index 0000000000..1470ae0e28 --- /dev/null +++ b/src/arch/riscv64/consts.rs @@ -0,0 +1,16 @@ +use super::CurrentRmmArch; +use rmm::Arch; + +const PML4_SHIFT: usize = (CurrentRmmArch::PAGE_LEVELS - 1) * CurrentRmmArch::PAGE_ENTRY_SHIFT + + CurrentRmmArch::PAGE_SHIFT; +/// The size of a single PML4 +pub const PML4_SIZE: usize = 1_usize << PML4_SHIFT; + +/// Offset to kernel heap +#[inline(always)] +pub fn kernel_heap_offset() -> usize { + crate::kernel_executable_offsets::KERNEL_OFFSET() - PML4_SIZE +} + +/// End offset of the user image, i.e. 
kernel start +pub const USER_END_OFFSET: usize = 1_usize << (CurrentRmmArch::PAGE_ADDRESS_SHIFT - 1); diff --git a/src/arch/riscv64/debug.rs b/src/arch/riscv64/debug.rs new file mode 100644 index 0000000000..b66a081c2e --- /dev/null +++ b/src/arch/riscv64/debug.rs @@ -0,0 +1,19 @@ +use spin::MutexGuard; + +use crate::{arch::device::serial::COM1, devices::serial::SerialKind}; + +pub struct Writer<'a> { + serial: MutexGuard<'a, SerialKind>, +} + +impl<'a> Writer<'a> { + pub fn new() -> Writer<'a> { + Writer { + serial: COM1.lock(), + } + } + + pub fn write(&mut self, buf: &[u8]) { + self.serial.write(buf); + } +} diff --git a/src/arch/riscv64/device/cpu/mod.rs b/src/arch/riscv64/device/cpu/mod.rs new file mode 100644 index 0000000000..23103f1b88 --- /dev/null +++ b/src/arch/riscv64/device/cpu/mod.rs @@ -0,0 +1,5 @@ +use core::fmt::{Result, Write}; + +pub fn cpu_info(_w: &mut W) -> Result { + unimplemented!() +} diff --git a/src/arch/riscv64/device/irqchip/clint.rs b/src/arch/riscv64/device/irqchip/clint.rs new file mode 100644 index 0000000000..273dcb1855 --- /dev/null +++ b/src/arch/riscv64/device/irqchip/clint.rs @@ -0,0 +1,42 @@ +use spin::Mutex; +use syscall::{Io, Mmio}; +use crate::context::switch::tick; + +#[repr(packed(4))] +#[repr(C)] +struct ClintRegs { + /// per-hart MSIP registers + /// bit 0: trigger IPI for the hart + msip: [Mmio; 4095], // +0000 -- 3fff + _rsrv1: u32, + /// per-hart MTIMECMP registers + /// timer interrupt trigger threshold + mtimecmp: [Mmio; 4095], // +4000 - bff7 + mtime: Mmio // current time +} + +pub struct Clint { + regs: &'static mut ClintRegs, + freq: u64 +} + +pub static CLINT: Mutex> = Mutex::new(None); + +impl Clint { + pub fn new(addr: *mut u8, size: usize, freq: usize) -> Self { + assert!(size >= size_of::()); + Self { + regs: unsafe { (addr as *mut ClintRegs).as_mut().unwrap() }, + freq: freq as u64 + } + } + + pub fn init(self: &mut Self) { + (*self.regs).mtimecmp[0].write((*self.regs).mtime.read() + self.freq / 100); + } 
+ + pub fn timer_irq(self: &mut Self, hart: usize) { + (*self.regs).mtimecmp[hart].write((*self.regs).mtimecmp[hart].read() + self.freq / 100); + tick(); + } +} diff --git a/src/arch/riscv64/device/irqchip/clint_sbi.rs b/src/arch/riscv64/device/irqchip/clint_sbi.rs new file mode 100644 index 0000000000..f880c4dd13 --- /dev/null +++ b/src/arch/riscv64/device/irqchip/clint_sbi.rs @@ -0,0 +1,150 @@ +use crate::{ + context, + context::timeout, + dtb::irqchip::{register_irq, InterruptHandler, IrqCell, IRQ_CHIP}, + sync::CleanLockToken, +}; +use alloc::{boxed::Box, vec::Vec}; +use core::{arch::asm, cmp::max}; +use fdt::node::FdtNode; +use spin::Mutex; +// This is a Core-Local Interruptor (CLINT). A single device directly routed into each HLIC +// It is responsible for local timer and IPI interrupts +// An example DTS: +// /soc/ +// clint@2000000/ +// interrupts-extended = <&hlic0 3>, <&hlic0 7>, <&hlic1 3>, <&hlic1 7>, +// <&hlic2 3>, <&hlic2 7>, <&hlic3 3>, <&hlic3 7>; +// reg = <0x200000000 0x10000>; +// compatible = "sifive,clint0", "riscv,clint0"; + +pub struct Clint { + freq: u64, + next_event: Vec, +} + +pub static CLINT: Mutex> = Mutex::new(None); +const TICKS_PER_SECOND: u64 = 100; +const IRQ_IPI: usize = 0; +const IRQ_TIMER: usize = 1; + +struct ClintConnector { + hart_id: usize, + irq: usize, +} + +impl InterruptHandler for ClintConnector { + fn irq_handler(&mut self, _irq: u32, token: &mut CleanLockToken) { + CLINT + .lock() + .as_mut() + .unwrap() + .irq_handler(self.hart_id, self.irq); + if self.irq == IRQ_TIMER { + // a bit of hack, but it is a really bad idea to call scheduler + // from inside clint irq handler + timeout::trigger(token); + context::switch::tick(token); + } + } +} + +fn map_interrupt(irq: u32) -> u32 { + match irq { + 3 => 1, // map M-mode IPI to S-mode IPI + 7 => 5, // map M-mode timer to S-mode timer + x => x, + } +} + +impl Clint { + pub fn new(freq: usize, node: &FdtNode) -> Self { + // TODO IPI + // let reg = 
clint_node.reg().unwrap().next().unwrap(); + // reg.starting_address.add(crate::PHYS_OFFSET) as *mut u8; + // reg.size.unwrap(); + + let mut me = Self { + freq: freq as u64, + next_event: Vec::new(), + }; + let mut interrupts = node + .property("interrupts-extended") + .unwrap() + .value + .as_chunks::<4>() + .0 + .iter() + .map(|&x| u32::from_be_bytes(x)); + let mut hart_id = 0; + while let Ok([phandle1, irq0, phandle2, irq1]) = interrupts.next_chunk::<4>() { + assert_eq!( + phandle1, phandle2, + "Invalid interrupts-extended property for CLINT" + ); + let hlic = unsafe { + IRQ_CHIP + .irq_chip_list + .chips + .iter() + .find(|x| x.phandle == phandle1) + .expect("Couldn't find HLIC in irqchip list for CLINT") + }; + + // FIXME dirty hack map M-mode interrupts (handled by SBI) to S-mode interrupts we get from SBI + // Why aren't S-mode interrupts in the DTB already? + let irq0 = IrqCell::L1(map_interrupt(irq0)); + let irq1 = IrqCell::L1(map_interrupt(irq1)); + + let virq0 = hlic + .ic + .irq_xlate(irq0) + .expect("Couldn't get virq 0 from HLIC"); + let virq1 = hlic + .ic + .irq_xlate(irq1) + .expect("Couldn't get virq 1 from HLIC"); + register_irq(virq0 as u32, Box::new(ClintConnector { hart_id, irq: 0 })); + register_irq(virq1 as u32, Box::new(ClintConnector { hart_id, irq: 1 })); + hart_id += 1; + } + me.next_event.resize_with(hart_id, || 0); + me + } + + pub(crate) fn irq_handler(self: &mut Self, hart_id: usize, irq: usize) { + match irq { + IRQ_IPI => { + println!("IPI interrupt at {}", hart_id); + } + IRQ_TIMER => { + let mtime: usize; + unsafe { + asm!( + "rdtime t0", + lateout("t0") mtime + ) + }; + + self.next_event[hart_id] = + max(self.next_event[hart_id], mtime as u64) + self.freq / TICKS_PER_SECOND; + sbi_rt::set_timer(self.next_event[hart_id]).expect("SBI timer cannot be set!"); + } + _ => { + panic!("Unexpected CLINT irq") + } + } + } + + pub fn init(self: &mut Self, hart: usize) { + let mtime: usize; + unsafe { + asm!( + "rdtime t0", + lateout("t0") 
mtime + ) + }; + self.next_event[hart] = mtime as u64 + (self.freq / TICKS_PER_SECOND); + sbi_rt::set_timer(self.next_event[hart]).expect("SBI timer cannot be set!"); + } +} diff --git a/src/arch/riscv64/device/irqchip/hlic.rs b/src/arch/riscv64/device/irqchip/hlic.rs new file mode 100644 index 0000000000..28b04dc8c5 --- /dev/null +++ b/src/arch/riscv64/device/irqchip/hlic.rs @@ -0,0 +1,170 @@ +use crate::{ + dtb::irqchip::{InterruptController, InterruptHandler, IrqCell, IrqDesc, IRQ_CHIP}, + sync::CleanLockToken, +}; +use alloc::vec::Vec; +use core::arch::asm; +use fdt::{node::NodeProperty, Fdt}; +use syscall::{Error, EINVAL}; + +// This is a hart-local interrupt controller, a root of irqchip tree +// An example DTS: +// /cpus/ +// cpu@1/ +// interrupt-controller/ +// #interrupt-cells = 0x00000001 +// interrupt-controller = +// compatible = "riscv,cpu-intc" +// phandle = 0x00000006 + +fn acknowledge(interrupt: usize) { + unsafe { + asm!( + "csrc sip, t0", + in("t0") 1usize << interrupt, + options(nostack) + ) + } +} + +pub unsafe fn interrupt(hart: usize, interrupt: usize, token: &mut CleanLockToken) { + unsafe { + assert!( + hart < CPU_INTERRUPT_HANDLERS.len(), + "Unexpected hart in interrupt routine" + ); + acknowledge(interrupt); + let ic_idx = CPU_INTERRUPT_HANDLERS[hart].unwrap_or_else(|| { + panic!( + "No hlic connected to hart {} yet interrupt {} occurred", + hart, interrupt + ) + }); + let virq = IRQ_CHIP + .irq_to_virq(ic_idx, interrupt as u32) + .unwrap_or_else(|| panic!("HLIC doesn't know of interrupt {}", interrupt)); + match &mut IRQ_CHIP.irq_desc[virq].handler { + Some(handler) => { + handler.irq_handler(virq as u32, token); + } + _ => match IRQ_CHIP.irq_desc[virq].basic.child_ic_idx { + Some(ic_idx) => { + IRQ_CHIP.irq_chip_list.chips[ic_idx] + .ic + .irq_handler(virq as u32, token); + } + _ => { + panic!( + "Unconnected interrupt {} occurred on hlic connected to hart {}", + interrupt, hart + ); + } + }, + } + } +} + +pub fn init() { + unsafe { + 
asm!( + "csrs sie, t0", + in("t0") (0xFFFF), + options(nostack) + ) + } +} + +static mut CPU_INTERRUPT_HANDLERS: Vec<Option<usize>> = Vec::new(); + +pub struct Hlic { + virq_base: usize, +} + +impl Hlic { + pub(crate) fn new() -> Self { + return Self { virq_base: 0 }; + } +} +impl InterruptHandler for Hlic { + fn irq_handler(&mut self, irq: u32, token: &mut CleanLockToken) { + assert!(irq < 16, "Unsupported HLIC interrupt raised!"); + unsafe { + IRQ_CHIP.trigger_virq(self.virq_base as u32 + irq, token); + } + } +} + +impl InterruptController for Hlic { + fn irq_init( + &mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + ic_idx: usize, + irq_idx: &mut usize, + ) -> syscall::Result<()> { + let desc = unsafe { &IRQ_CHIP.irq_chip_list.chips[ic_idx] }; + let fdt = fdt_opt.unwrap(); + let cpu_node = fdt + .find_all_nodes("/cpus/cpu") + .find(|x| { + x.children().any(|x| { + x.property("phandle").and_then(NodeProperty::as_usize) + == Some(desc.phandle as usize) + }) + }) + .expect("Could not find CPU node for HLIC controller"); + let hart = cpu_node.property("reg").unwrap().as_usize().unwrap(); + unsafe { + if CPU_INTERRUPT_HANDLERS.len() <= hart { + CPU_INTERRUPT_HANDLERS.resize(hart + 1, None); + } + assert!( + CPU_INTERRUPT_HANDLERS[hart].replace(ic_idx).is_none(), + "Conflicting HLIC interrupt handler found" + ); + } + self.virq_base = *irq_idx; + for i in 0..16 { + irq_desc[self.virq_base + i].basic.ic_idx = ic_idx; + irq_desc[self.virq_base + i].basic.ic_irq = i as u32; + } + *irq_idx += 16; + Ok(()) + } + + fn irq_ack(&mut self) -> u32 { + panic!("Cannot ack HLIC interrupt"); + } + + fn irq_eoi(&mut self, _irq_num: u32) {} + + fn irq_enable(&mut self, _irq_num: u32) { + // This would require IPI to a correct core + // Not bothering with this, all interrupts are enabled at all times + } + + fn irq_disable(&mut self, _irq_num: u32) { + // This would require IPI to a correct core + // Not bothering with this, all interrupts are enabled at all times + } + + fn 
irq_xlate(&self, irq_data: IrqCell) -> syscall::Result<usize> { + match irq_data { + IrqCell::L1(irq) if irq <= 0xF => Ok(self.virq_base + irq as usize), + _ => Err(Error::new(EINVAL)), + } + } + + fn irq_to_virq(&self, hwirq: u32) -> Option<usize> { + if hwirq > 0 && hwirq <= 0xF { + Some(self.virq_base + hwirq as usize) + } else { + None + } + } +} + +pub fn irqchip_for_hart(hart: usize) -> Option<usize> { + let value = unsafe { CPU_INTERRUPT_HANDLERS.get(hart) }?; + *value +} diff --git a/src/arch/riscv64/device/irqchip/mod.rs b/src/arch/riscv64/device/irqchip/mod.rs new file mode 100644 index 0000000000..a6d7d2cdb5 --- /dev/null +++ b/src/arch/riscv64/device/irqchip/mod.rs @@ -0,0 +1,44 @@ +use self::clint::Clint; +use crate::dtb::irqchip::InterruptController; +use alloc::boxed::Box; +use fdt::Fdt; + +pub(crate) mod hlic; +mod plic; + +#[path = "clint_sbi.rs"] +mod clint; + +// pub mod clint; // actual clint.rs off limits if SBI is present + +pub fn new_irqchip(ic_str: &str) -> Option<Box<dyn InterruptController>> { + if ic_str.contains("riscv,cpu-intc") { + Some(Box::new(hlic::Hlic::new())) + } else if ic_str.contains("riscv,plic0") || ic_str.contains("sifive,plic-1.0.0") { + Some(Box::new(plic::Plic::new())) + } else { + warn!("no driver for interrupt controller {:?}", ic_str); + None + } +} + +pub unsafe fn init_clint(fdt: &Fdt) { + let cpus = fdt.find_node("/cpus").unwrap(); + let clock_freq = cpus + .property("timebase-frequency") + .unwrap() + .as_usize() + .unwrap(); + + let clint_node = fdt.find_node("/soc/clint").unwrap(); + assert!(clint_node + .compatible() + .unwrap() + .all() + .find(|x| (*x).eq("riscv,clint0")) + .is_some()); + + let clint = Clint::new(clock_freq, &clint_node); + *clint::CLINT.lock() = Some(clint); + clint::CLINT.lock().as_mut().unwrap().init(0); +} diff --git a/src/arch/riscv64/device/irqchip/plic.rs b/src/arch/riscv64/device/irqchip/plic.rs new file mode 100644 index 0000000000..ac08ff43f8 --- /dev/null +++ b/src/arch/riscv64/device/irqchip/plic.rs @@ -0,0 +1,198 @@ +use 
crate::{ + arch::{device::irqchip::hlic, start::BOOT_HART_ID}, + dtb::{ + get_mmio_address, + irqchip::{InterruptController, InterruptHandler, IrqCell, IrqDesc, IRQ_CHIP}, + }, + sync::CleanLockToken, +}; +use core::{mem, num::NonZero, sync::atomic::Ordering}; +use fdt::Fdt; +use syscall::{Error, Io, Mmio, EINVAL}; + +#[repr(packed(4))] +#[repr(C)] +struct InterruptThresholdRegs { + threshold: Mmio<u32>, + claim_complete: Mmio<u32>, + _rsrv: [u32; 1022], +} + +static MAX_CONTEXTS: usize = 64; + +#[repr(packed(4))] +#[repr(C)] +struct PlicRegs { + /// source priorities + source_priority: [Mmio<u32>; 1024], // +0000 -- 0fff + // pending interrupts + pending: [Mmio<u32>; 1024], // +1000 -- 1fff + // per-context interrupt enable + enable: [[Mmio<u32>; 32]; 16320], // +2000 - 1f'ffff + // per-context priority threshold and acknowledge + thresholds: [InterruptThresholdRegs; 64], // specced at +20'0000 - 0fff'ffff for 15872 contexts + // but actual memory allotted in firmware is much lower +} + +const _: () = assert!(0x1000 == mem::offset_of!(PlicRegs, pending)); +const _: () = assert!(0x2000 == mem::offset_of!(PlicRegs, enable)); +const _: () = assert!(0x20_0000 == mem::offset_of!(PlicRegs, thresholds)); +const _: () = assert!(0x1000 == size_of::<InterruptThresholdRegs>()); + +impl PlicRegs { + pub fn set_priority(self: &mut Self, irq: usize, priority: usize) { + assert!(irq > 0 && irq <= 1023 && priority < 8); + self.source_priority[irq].write(priority as u32); + } + + pub fn pending(self: &Self, irq_lane: usize) -> u32 { + assert!(irq_lane < 32); + self.pending[irq_lane].read() + } + + pub fn enable(self: &mut Self, context: usize, irq: NonZero<usize>, enable: bool) { + assert!(irq.get() <= 1023 && context < MAX_CONTEXTS); + let irq_lane = irq.get() / 32; + let irq = irq.get() % 32; + self.enable[context][irq_lane].writef(1u32 << irq, enable); + } + + pub fn set_priority_threshold(self: &mut Self, context: usize, priority: usize) { + assert!(context < MAX_CONTEXTS && priority <= 7); + 
self.thresholds[context].threshold.write(priority as u32); + } + + pub fn claim(self: &mut Self, context: usize) -> Option<NonZero<usize>> { + assert!(context < MAX_CONTEXTS); + let claim = self.thresholds[context].claim_complete.read(); + NonZero::new(claim as usize) + } + + pub fn complete(self: &mut Self, context: usize, claim: NonZero<usize>) { + assert!(context < MAX_CONTEXTS); + self.thresholds[context] + .claim_complete + .write(claim.get() as u32); + } +} + +pub struct Plic { + regs: *mut PlicRegs, + ndev: usize, + virq_base: usize, + context: usize, +} + +impl Plic { + pub fn new() -> Self { + Self { + regs: 0 as *mut PlicRegs, + ndev: 0, + virq_base: 0, + context: 0, + } + } +} +impl InterruptHandler for Plic { + fn irq_handler(&mut self, _irq: u32, token: &mut CleanLockToken) { + unsafe { + let irq = self.irq_ack(); + //println!("PLIC interrupt {}", irq); + if let Some(virq) = self.irq_to_virq(irq) { + IRQ_CHIP.trigger_virq(virq as u32, token); + } else { + error!("unexpected irq num {}", irq); + self.irq_eoi(irq); + } + } + //println!("PLIC interrupt done"); + } +} + +impl InterruptController for Plic { + fn irq_init( + &mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + ic_idx: usize, + irq_idx: &mut usize, + ) -> syscall::Result<()> { + let desc = unsafe { &IRQ_CHIP.irq_chip_list.chips[ic_idx] }; + let fdt = fdt_opt.unwrap(); + let my_node = fdt.find_phandle(desc.phandle).unwrap(); + + // MMIO region + let reg = my_node.reg().unwrap().next().unwrap(); + let addr = get_mmio_address(&fdt, &my_node, &reg).unwrap(); + // Specifies how many external interrupts are supported by this controller. 
+ let ndev = my_node + .property("riscv,ndev") + .and_then(|x| x.as_usize()) + .unwrap(); + + self.regs = (addr + crate::PHYS_OFFSET) as *mut PlicRegs; + self.ndev = ndev; + + self.virq_base = *irq_idx; + for i in 0..ndev { + irq_desc[self.virq_base + i].basic.ic_idx = ic_idx; + irq_desc[self.virq_base + i].basic.ic_irq = i as u32; + } + *irq_idx += ndev; + + // route all interrupts to boot HART + // TODO spread irqs over all the cores when we have them? + let hlic_ic_idx = hlic::irqchip_for_hart(BOOT_HART_ID.load(Ordering::Relaxed)) + .expect("Could not find HLIC irqchip for the boot hart while initing PLIC"); + self.context = desc + .parents + .iter() + .position(|x| x.parent_interrupt.is_some() && x.parent == hlic_ic_idx) + .unwrap(); + info!("PLIC: using context {}", self.context); + + let regs = unsafe { self.regs.as_mut().unwrap() }; + regs.set_priority_threshold(self.context, 0); + + Ok(()) + } + + fn irq_ack(&mut self) -> u32 { + let regs = unsafe { self.regs.as_mut().unwrap() }; + regs.claim(self.context).unwrap().get() as u32 + } + + fn irq_eoi(&mut self, irq_num: u32) { + let regs = unsafe { self.regs.as_mut().unwrap() }; + regs.complete(self.context, NonZero::new(irq_num as usize).unwrap()); + } + + fn irq_enable(&mut self, irq_num: u32) { + assert!(irq_num > 0 && irq_num as usize <= self.ndev); + let regs = unsafe { self.regs.as_mut().unwrap() }; + regs.set_priority(irq_num as usize, 1); + regs.enable(self.context, NonZero::new(irq_num as usize).unwrap(), true); + } + + fn irq_disable(&mut self, irq_num: u32) { + assert!(irq_num > 0 && irq_num as usize <= self.ndev); + let regs = unsafe { self.regs.as_mut().unwrap() }; + regs.set_priority(irq_num as usize, 1); + regs.enable(self.context, NonZero::new(irq_num as usize).unwrap(), false); + } + + fn irq_xlate(&self, irq_data: IrqCell) -> syscall::Result<usize> { + match irq_data { + IrqCell::L1(irq) => Ok(self.virq_base + irq as usize), + _ => Err(Error::new(EINVAL)), + } + } + + fn irq_to_virq(&self, hwirq: 
u32) -> Option<usize> { + if (hwirq as usize) < self.ndev { + Some(self.virq_base + hwirq as usize) + } else { + None + } + } +} diff --git a/src/arch/riscv64/device/mod.rs b/src/arch/riscv64/device/mod.rs new file mode 100644 index 0000000000..f072f943ae --- /dev/null +++ b/src/arch/riscv64/device/mod.rs @@ -0,0 +1,110 @@ +use crate::{ + arch::{device::irqchip::hlic, time}, + dtb::DTB_BINARY, +}; +use fdt::{ + node::{FdtNode, NodeProperty}, + Fdt, +}; + +pub mod cpu; +pub(crate) mod irqchip; +pub mod serial; + +use crate::arch::device::irqchip::init_clint; + +fn string_property(name: &str) -> bool { + name == "compatible" + || name == "model" + || name == "device_type" + || name == "status" + || name == "riscv,isa-base" + || name == "riscv,isa" + || name == "mmu-type" + || name == "stdout-path" +} + +fn print_property(prop: &NodeProperty, n_spaces: usize) { + (0..n_spaces).for_each(|_| print!(" ")); + print!("{} =", prop.name); + if string_property(prop.name) + && let Some(str) = prop.as_str() + { + println!(" \"{}\"", str); + } else if let Some(value) = prop.as_usize() { + println!(" 0x{:08x}", value); + } else { + for v in prop.value { + print!(" {:02x}", v); + } + println!(); + } +} +fn print_node(node: &FdtNode<'_, '_>, n_spaces: usize) { + (0..n_spaces).for_each(|_| print!(" ")); + println!("{}/", node.name); + for prop in node.properties() { + print_property(&prop, n_spaces + 4); + } + + for child in node.children() { + print_node(&child, n_spaces + 4); + } +} + +pub(crate) fn dump_fdt(fdt: &Fdt) { + if let Some(root) = fdt.find_node("/") { + print_node(&root, 0); + } +} + +unsafe fn init_intc(cpu: &FdtNode) { + let intc_node = cpu + .children() + .find(|x| x.name == "interrupt-controller") + .unwrap(); + assert_eq!(intc_node.compatible().unwrap().first(), "riscv,cpu-intc"); + // This controller is hardwired into interrupt handler code and has no Mmios + hlic::init(); // enable interrupts at HLIC level +} + +pub unsafe fn init() { + unsafe { + let data = 
DTB_BINARY.get().unwrap(); + let fdt = Fdt::new(data).unwrap(); + + crate::dtb::irqchip::init(&fdt); + + let cpu = fdt.find_node(format!("/cpus/cpu@{}", 0).as_str()).unwrap(); + init_intc(&cpu); + init_time(&fdt); + } +} + +fn init_time(fdt: &Fdt) { + let cpus = fdt.find_node("/cpus").unwrap(); + let clock_freq = cpus + .property("timebase-frequency") + .unwrap() + .as_usize() + .unwrap(); + time::init(clock_freq); +} + +pub unsafe fn init_noncore() { + unsafe { + let data = DTB_BINARY.get().unwrap(); + let fdt = Fdt::new(data).unwrap(); + + init_clint(&fdt); + serial::init(&fdt); + } +} + +pub struct ArchPercpuMisc; + +impl ArchPercpuMisc { + pub const fn default() -> Self { + Self + } +} diff --git a/src/arch/riscv64/device/serial.rs b/src/arch/riscv64/device/serial.rs new file mode 100644 index 0000000000..4aae604a62 --- /dev/null +++ b/src/arch/riscv64/device/serial.rs @@ -0,0 +1,47 @@ +use alloc::boxed::Box; +use fdt::Fdt; + +pub use crate::dtb::serial::COM1; +use crate::{ + dtb::{ + get_interrupt, interrupt_parent, + irqchip::{register_irq, InterruptHandler, IRQ_CHIP}, + }, + scheme::irq::irq_trigger, + sync::CleanLockToken, +}; + +pub struct Com1Irq {} + +impl InterruptHandler for Com1Irq { + fn irq_handler(&mut self, irq: u32, token: &mut CleanLockToken) { + COM1.lock().receive(token); + unsafe { + // FIXME add_irq accepts a u8 as irq number + // PercpuBlock::current().stats.add_irq(irq); + irq_trigger(irq.try_into().unwrap(), token); + IRQ_CHIP.irq_eoi(irq); + } + } +} + +pub unsafe fn init(fdt: &Fdt) -> Option<()> { + unsafe { + if let Some(node) = fdt.find_compatible(&["ns16550a", "snps,dw-apb-uart"]) { + let intr = get_interrupt(fdt, &node, 0).unwrap(); + let interrupt_parent = interrupt_parent(fdt, &node)?; + let phandle = interrupt_parent.property("phandle")?.as_usize()? 
as u32; + let ic_idx = IRQ_CHIP.phandle_to_ic_idx(phandle)?; + + let virq = IRQ_CHIP.irq_chip_list.chips[ic_idx] + .ic + .irq_xlate(intr) + .unwrap(); + info!("serial_port virq = {}", virq); + register_irq(virq as u32, Box::new(Com1Irq {})); + IRQ_CHIP.irq_enable(virq as u32); + } + // COM1.lock().enable_irq(); // FIXME receive int is enabled by default in 16550. Disable by default? + Some(()) + } +} diff --git a/src/arch/riscv64/interrupt/exception.rs b/src/arch/riscv64/interrupt/exception.rs new file mode 100644 index 0000000000..6770a5f24d --- /dev/null +++ b/src/arch/riscv64/interrupt/exception.rs @@ -0,0 +1,229 @@ +use ::syscall::Exception; +use core::{arch::naked_asm, sync::atomic::Ordering}; +use rmm::VirtualAddress; + +use crate::{ + arch::{device::irqchip, start::BOOT_HART_ID}, + context::signal::excp_handler, + memory::GenericPfFlags, + ptrace, + sync::CleanLockToken, + syscall::{self, flag::*}, +}; + +const BREAKPOINT: usize = 3; +const USERMODE_ECALL: usize = 8; +const INSTRUCTION_PAGE_FAULT: usize = 12; +const LOAD_PAGE_FAULT: usize = 13; +const STORE_PAGE_FAULT: usize = 15; + +use super::InterruptStack; + +#[unsafe(naked)] +// FIXME use extern "custom" +// FIXME use align(4) +pub unsafe extern "C" fn exception_handler() { + naked_asm!( + "csrrw tp, sscratch, tp", + "beq tp, x0, 3f", // exception before percpu data is available; got to be S mode + + "sd t0, 0(tp)", + "csrr t0, sstatus", + "andi t0, t0, 1<<8",// SPP bit + "bne t0, x0, 2f", + + // trap/interrupt from U mode, switch stacks + "ld t0, 0(tp)", + "sd sp, 0(tp)", + "ld sp, 8(tp)", + + push_registers!(), + "ld t0, 0(tp)", + "sd t0, (1 * 8)(sp)", // save original SP + "csrrw t0, sscratch, tp", + "sd t0, (3 * 8)(sp)", // save original TP, and restore sscratch to handle double faults + + "mv a0, sp", + "jal {0}", + + // save S mode stack to percpu + "addi t0, sp, 32 * 8", + "sd t0, 8(tp)", + "li t0, 1 << 8", // return to U mode (sstatus might've been modified by nested trap or context switch) + 
"csrc sstatus, t0", + "j 4f", + + "2: ld t0, 0(tp)", // S-mode + "3:", // S mode early + + "addi sp, sp, -2 * 8", // fake stack frame for the stack tracer + + push_registers!(), + + "addi t1, sp, 34 * 8", + "sd t1, (1 * 8)(sp)", // save original SP + "csrrw t1, sscratch, tp", + "sd t1, (3 * 8)(sp)", // save original TP, and restore sscratch to handle double faults + + "sd t0, (33 * 8)(sp)", // fill the stack frame. t0 holds original pc after push_registers + "sd fp, (32 * 8)(sp)", + "addi fp, sp, 34 * 8", + + "mv a0, sp", + "jal {0}", + // return to S mode with interrupts disabled + // (sstatus might've been modified by nested trap or context switch) + "li t0, 1 << 8", + "csrs sstatus, t0", + "li t0, 1 << 5", + "csrc sstatus, t0", + + "4:", + pop_registers!(), + "sret", + sym exception_handler_inner + ); +} + +unsafe fn exception_handler_inner(regs: &mut InterruptStack) { + unsafe { + let scause: usize; + let sstatus: usize; + core::arch::asm!( + "csrr t0, scause", + "csrr t1, sstatus", + lateout("t0") scause, + lateout("t1") sstatus, + options(nostack) + ); + + //info!("Exception handler incoming: sepc={:x} scause={:x} sstatus={:x}", regs.iret.sepc, scause, sstatus); + + let user_mode = sstatus & (1 << 8) == 0; + + if (scause as isize) < 0 { + handle_interrupt(scause & 0xF); + } else if page_fault(scause, regs, user_mode) { + } else if user_mode { + handle_user_exception(scause, regs); + } else { + handle_system_exception(scause, regs); + } + //info!("Exception handler outgoing"); + } +} + +unsafe fn handle_system_exception(scause: usize, regs: &InterruptStack) { + unsafe { + let stval: usize; + let tp: usize; + core::arch::asm!( + "csrr t0, stval", + "mv t1, tp", + lateout("t0") stval, + lateout("t1") tp, + options(nostack) + ); + + error!( + "S-mode exception! 
scause={:#016x}, stval={:#016x}", + scause, stval + ); + + if tp == 0 { + // Early failure - before misc::init and potentially before RMM init + // Do not attempt to trace stack because it would probably trap again + regs.dump(); + } else { + regs.trace(); + } + loop {} + } +} + +unsafe fn handle_interrupt(interrupt: usize) { + unsafe { + let mut token = CleanLockToken::new(); + // FIXME retrieve from percpu area + // For now all the interrupts go to boot hart so this suffices... + let hart: usize = BOOT_HART_ID.load(Ordering::Relaxed); + irqchip::hlic::interrupt(hart, interrupt, &mut token); + } +} + +unsafe fn handle_user_exception(scause: usize, regs: &mut InterruptStack) { + unsafe { + let mut token = CleanLockToken::new(); + + if scause == USERMODE_ECALL { + let r = &mut regs.registers; + regs.iret.sepc += 4; // skip ecall + let ret = syscall::syscall(r.x17, r.x10, r.x11, r.x12, r.x13, r.x14, r.x15, &mut token); + r.x10 = ret; + return; + } + + if scause == BREAKPOINT { + if ptrace::breakpoint_callback(PTRACE_STOP_BREAKPOINT, None, &mut token).is_some() { + return; + } + } + + let stval: usize; + core::arch::asm!( + "csrr t0, stval", + lateout("t0") stval, + options(nostack) + ); + + info!( + "U-mode exception! 
scause={:#016x}, stval={:#016x}", + scause, stval + ); + regs.dump(); + + // TODO + /* + let signal = match scause { + 0 | 4 | 6 | 18 | 19 => SIGBUS, // misaligned / machine check + 2 | 8 | 9 => SIGILL, // Illegal instruction / breakpoint / ecall + BREAKPOINT => SIGTRAP, + _ => SIGSEGV, + }; + */ + excp_handler(Exception { kind: scause }); + } +} + +unsafe fn page_fault(scause: usize, regs: &mut InterruptStack, user_mode: bool) -> bool { + unsafe { + if scause != INSTRUCTION_PAGE_FAULT + && scause != LOAD_PAGE_FAULT + && scause != STORE_PAGE_FAULT + { + return false; + } + + let stval: usize; + core::arch::asm!( + "csrr t0, stval", + lateout("t0") stval, + options(nostack) + ); + + let address = VirtualAddress::new(stval); + let mut generic_flags = GenericPfFlags::empty(); + + generic_flags.set(GenericPfFlags::INVOLVED_WRITE, scause == STORE_PAGE_FAULT); + generic_flags.set(GenericPfFlags::USER_NOT_SUPERVISOR, user_mode); + generic_flags.set( + GenericPfFlags::INSTR_NOT_DATA, + scause == INSTRUCTION_PAGE_FAULT, + ); + // FIXME can these conditions be distinguished? Should they be? + generic_flags.set(GenericPfFlags::INVL, false); + generic_flags.set(GenericPfFlags::PRESENT, false); + + crate::memory::page_fault_handler(regs, generic_flags, address).is_ok() + } +} diff --git a/src/arch/riscv64/interrupt/handler.rs b/src/arch/riscv64/interrupt/handler.rs new file mode 100644 index 0000000000..5b2470ee8c --- /dev/null +++ b/src/arch/riscv64/interrupt/handler.rs @@ -0,0 +1,332 @@ +use crate::{memory::ArchIntCtx, panic, syscall::IntRegisters}; + +#[derive(Default)] +#[repr(C)] +pub struct Registers { + pub x1: usize, // ra + pub x2: usize, // sp + pub x3: usize, // gp + pub x4: usize, // tp + pub x5: usize, // t0 + pub x6: usize, // t1 + pub x7: usize, // t2 + pub x8: usize, // s0/fp + pub x9: usize, // s1 + pub x10: usize, // a0... 
+ pub x11: usize, + pub x12: usize, + pub x13: usize, + pub x14: usize, + pub x15: usize, + pub x16: usize, + pub x17: usize, // a7 + pub x18: usize, // s2... + pub x19: usize, + pub x20: usize, + pub x21: usize, + pub x22: usize, + pub x23: usize, + pub x24: usize, + pub x25: usize, + pub x26: usize, + pub x27: usize, // s11 + pub x28: usize, // t3... + pub x29: usize, + pub x30: usize, + pub x31: usize, // t6 +} + +impl Registers { + pub fn dump(&self) { + println!("X1: {:>016X}", { self.x1 }); + println!("X2: {:>016X}", { self.x2 }); + println!("X3: {:>016X}", { self.x3 }); + println!("X4: {:>016X}", { self.x4 }); + println!("X5: {:>016X}", { self.x5 }); + println!("X6: {:>016X}", { self.x6 }); + println!("X7: {:>016X}", { self.x7 }); + println!("X8: {:>016X}", { self.x8 }); + println!("X9: {:>016X}", { self.x9 }); + println!("X10: {:>016X}", { self.x10 }); + println!("X11: {:>016X}", { self.x11 }); + println!("X12: {:>016X}", { self.x12 }); + println!("X13: {:>016X}", { self.x13 }); + println!("X14: {:>016X}", { self.x14 }); + println!("X15: {:>016X}", { self.x15 }); + println!("X16: {:>016X}", { self.x16 }); + println!("X17: {:>016X}", { self.x17 }); + println!("X18: {:>016X}", { self.x18 }); + println!("X19: {:>016X}", { self.x19 }); + println!("X20: {:>016X}", { self.x20 }); + println!("X21: {:>016X}", { self.x21 }); + println!("X22: {:>016X}", { self.x22 }); + println!("X23: {:>016X}", { self.x23 }); + println!("X24: {:>016X}", { self.x24 }); + println!("X25: {:>016X}", { self.x25 }); + println!("X26: {:>016X}", { self.x26 }); + println!("X27: {:>016X}", { self.x27 }); + println!("X28: {:>016X}", { self.x28 }); + println!("X29: {:>016X}", { self.x29 }); + println!("X30: {:>016X}", { self.x30 }); + println!("X31: {:>016X}", { self.x31 }); + } +} + +#[derive(Default)] +#[repr(C)] +pub struct IretRegisters { + pub sepc: usize, +} + +impl IretRegisters { + pub fn dump(&self) { + println!("SEPC: {:>016X}", { self.sepc }); + } +} + +// NOTE: Layout of this 
structure must be synced with assembly code in exception.rs +#[derive(Default)] +#[repr(C)] +pub struct InterruptStack { + pub registers: Registers, + pub iret: IretRegisters, +} + +impl InterruptStack { + pub fn init(&mut self) { + const { + assert!(32 * 8 == size_of::<InterruptStack>()); + } + } + pub fn frame_pointer(&self) -> usize { + self.registers.x8 + } + pub fn stack_pointer(&self) -> usize { + self.registers.x2 + } + pub fn set_stack_pointer(&mut self, sp: usize) { + self.registers.x2 = sp; + } + pub fn set_instr_pointer(&mut self, ip: usize) { + self.iret.sepc = ip; + } + pub fn instr_pointer(&self) -> usize { + self.iret.sepc + } + pub fn sig_archdep_reg(&self) -> usize { + self.registers.x5 + } + + pub fn set_syscall_ret_reg(&mut self, ret: usize) { + self.registers.x10 = ret; + } + + pub fn set_arg1(&mut self, arg_opt: Option<usize>) { + if let Some(arg) = arg_opt { + self.registers.x11 = arg; + } + } + + pub fn dump(&self) { + self.iret.dump(); + self.registers.dump(); + } + + pub fn trace(&self) { + self.dump(); + unsafe { + panic::user_stack_trace(&self); + panic::stack_trace(); + } + } + + /// Saves all registers to a struct used by the proc: + /// scheme to read/write registers. 
+ pub fn save(&self, all: &mut IntRegisters) { + all.pc = self.iret.sepc; + all.x31 = self.registers.x31; + all.x30 = self.registers.x30; + all.x29 = self.registers.x29; + all.x28 = self.registers.x28; + all.x27 = self.registers.x27; + all.x26 = self.registers.x26; + all.x25 = self.registers.x25; + all.x24 = self.registers.x24; + all.x23 = self.registers.x23; + all.x22 = self.registers.x22; + all.x21 = self.registers.x21; + all.x20 = self.registers.x20; + all.x19 = self.registers.x19; + all.x18 = self.registers.x18; + all.x17 = self.registers.x17; + all.x16 = self.registers.x16; + all.x15 = self.registers.x15; + all.x14 = self.registers.x14; + all.x13 = self.registers.x13; + all.x12 = self.registers.x12; + all.x11 = self.registers.x11; + all.x10 = self.registers.x10; + all.x9 = self.registers.x9; + all.x8 = self.registers.x8; + all.x7 = self.registers.x7; + all.x6 = self.registers.x6; + all.x5 = self.registers.x5; + all.x2 = self.registers.x2; + all.x1 = self.registers.x1; + } + + /// Loads all registers from a struct used by the proc: + /// scheme to read/write registers. 
+ pub fn load(&mut self, all: &IntRegisters) { + self.iret.sepc = all.pc; + self.registers.x31 = all.x31; + self.registers.x30 = all.x30; + self.registers.x29 = all.x29; + self.registers.x28 = all.x28; + self.registers.x27 = all.x27; + self.registers.x26 = all.x26; + self.registers.x25 = all.x25; + self.registers.x24 = all.x24; + self.registers.x23 = all.x23; + self.registers.x22 = all.x22; + self.registers.x21 = all.x21; + self.registers.x20 = all.x20; + self.registers.x19 = all.x19; + self.registers.x18 = all.x18; + self.registers.x17 = all.x17; + self.registers.x16 = all.x16; + self.registers.x15 = all.x15; + self.registers.x14 = all.x14; + self.registers.x13 = all.x13; + self.registers.x12 = all.x12; + self.registers.x11 = all.x11; + self.registers.x10 = all.x10; + self.registers.x9 = all.x9; + self.registers.x8 = all.x8; + self.registers.x7 = all.x7; + self.registers.x6 = all.x6; + self.registers.x5 = all.x5; + self.registers.x2 = all.x2; + self.registers.x1 = all.x1; + } + + //TODO + pub fn is_singlestep(&self) -> bool { + false + } + pub fn set_singlestep(&mut self, _singlestep: bool) {} +} + +impl ArchIntCtx for InterruptStack { + fn ip(&self) -> usize { + self.iret.sepc + } + fn recover_and_efault(&mut self) { + // Set the return value to nonzero to indicate usercopy failure (EFAULT), and emulate the + // return instruction by setting the return pointer to the saved LR value. + self.iret.sepc = self.registers.x1; // ra + self.registers.x10 = 1; // a0 + } +} + +/// Except for sp and tp +#[macro_export] +macro_rules! 
push_registers { + () => { + " + addi sp, sp, -32 * 8 + sd x1, (0 * 8)(sp) + // skip sp + sd x3, (2 * 8)(sp) + // skip tp + sd x5, (4 * 8)(sp) + sd x6, (5 * 8)(sp) + sd x7, (6 * 8)(sp) + sd x8, (7 * 8)(sp) + sd x9, (8 * 8)(sp) + sd x10, (9 * 8)(sp) + sd x11, (10 * 8)(sp) + sd x12, (11 * 8)(sp) + sd x13, (12 * 8)(sp) + sd x14, (13 * 8)(sp) + sd x15, (14 * 8)(sp) + sd x16, (15 * 8)(sp) + sd x17, (16 * 8)(sp) + sd x18, (17 * 8)(sp) + sd x19, (18 * 8)(sp) + sd x20, (19 * 8)(sp) + sd x21, (20 * 8)(sp) + sd x22, (21 * 8)(sp) + sd x23, (22 * 8)(sp) + sd x24, (23 * 8)(sp) + sd x25, (24 * 8)(sp) + sd x26, (25 * 8)(sp) + sd x27, (26 * 8)(sp) + sd x28, (27 * 8)(sp) + sd x29, (28 * 8)(sp) + sd x30, (29 * 8)(sp) + sd x31, (30 * 8)(sp) + + csrr t0, sepc + sd t0, (31 * 8)(sp) + " + }; // keep sepc value in t0 on exit +} + +#[macro_export] +macro_rules! pop_registers { + () => { + " + ld t0, (31 * 8)(sp) + csrw sepc, t0 + + ld x1, (0 * 8)(sp) + // skip sp, it'll be restored later + ld x3, (2 * 8)(sp) + ld x4, (3 * 8)(sp) + ld x5, (4 * 8)(sp) + ld x6, (5 * 8)(sp) + ld x7, (6 * 8)(sp) + ld x8, (7 * 8)(sp) + ld x9, (8 * 8)(sp) + ld x10, (9 * 8)(sp) + ld x11, (10 * 8)(sp) + ld x12, (11 * 8)(sp) + ld x13, (12 * 8)(sp) + ld x14, (13 * 8)(sp) + ld x15, (14 * 8)(sp) + ld x16, (15 * 8)(sp) + ld x17, (16 * 8)(sp) + ld x18, (17 * 8)(sp) + ld x19, (18 * 8)(sp) + ld x20, (19 * 8)(sp) + ld x21, (20 * 8)(sp) + ld x22, (21 * 8)(sp) + ld x23, (22 * 8)(sp) + ld x24, (23 * 8)(sp) + ld x25, (24 * 8)(sp) + ld x26, (25 * 8)(sp) + ld x27, (26 * 8)(sp) + ld x28, (27 * 8)(sp) + ld x29, (28 * 8)(sp) + ld x30, (29 * 8)(sp) + ld x31, (30 * 8)(sp) + ld sp, (1 * 8)(sp) + " + }; +} + +#[unsafe(naked)] +pub unsafe extern "C" fn enter_usermode() -> ! 
{ + core::arch::naked_asm!( + "jalr s11", + "li t0, 1 << 8", // force U mode on sret + "csrc sstatus, t0", + "li t0, 0x6000", // set FS to dirty (enable FPU in U mode) + "csrs sstatus, t0", + "addi t0, sp, 32 * 8", // save S mode stack to percpu + "sd t0, 8(tp)", + pop_registers!(), + "sret", + ) +} diff --git a/src/arch/riscv64/interrupt/mod.rs b/src/arch/riscv64/interrupt/mod.rs new file mode 100644 index 0000000000..e60df94659 --- /dev/null +++ b/src/arch/riscv64/interrupt/mod.rs @@ -0,0 +1,39 @@ +use core::arch::asm; + +#[macro_use] +mod handler; + +mod exception; +pub mod syscall; +pub mod trace; + +pub use exception::exception_handler; +pub use handler::InterruptStack; + +/// Clear interrupts +#[inline(always)] +pub unsafe fn disable() { + unsafe { asm!("csrci sstatus, 1 << 1") } +} + +/// Set interrupts and halt +/// This will atomically wait for the next interrupt +/// Performing enable followed by halt is not guaranteed to be atomic, use this instead! +#[inline(always)] +pub unsafe fn enable_and_halt() { + unsafe { asm!("wfi", "csrsi sstatus, 1 << 1", "nop") } +} + +/// Set interrupts and nop +/// This will enable interrupts and allow the IF flag to be processed +/// Simply enabling interrupts does not guarantee that they will trigger, use this instead! 
+#[inline(always)] +pub unsafe fn enable_and_nop() { + unsafe { asm!("csrsi sstatus, 1 << 1", "nop") } +} + +/// Halt instruction +#[inline(always)] +pub unsafe fn halt() { + unsafe { asm!("wfi", options(nomem, nostack)) } +} diff --git a/src/arch/riscv64/interrupt/syscall.rs b/src/arch/riscv64/interrupt/syscall.rs new file mode 100644 index 0000000000..b7c675146c --- /dev/null +++ b/src/arch/riscv64/interrupt/syscall.rs @@ -0,0 +1 @@ +pub use super::handler::enter_usermode; diff --git a/src/arch/riscv64/interrupt/trace.rs b/src/arch/riscv64/interrupt/trace.rs new file mode 100644 index 0000000000..5d24fe7503 --- /dev/null +++ b/src/arch/riscv64/interrupt/trace.rs @@ -0,0 +1,35 @@ +use core::arch::asm; + +pub struct StackTrace { + pub fp: usize, + pub pc_ptr: *const usize, +} + +impl StackTrace { + #[inline(always)] + pub unsafe fn start() -> Option<Self> { + unsafe { + let fp: usize; + asm!("mv {}, fp", out(reg) fp); + + let pc_ptr = fp.checked_sub(size_of::<usize>())?; + let fp = pc_ptr.checked_sub(size_of::<usize>())?; + Some(StackTrace { + fp, + pc_ptr: pc_ptr as *const usize, + }) + } + } + + pub unsafe fn next(self) -> Option<Self> { + unsafe { + let fp = *(self.fp as *const usize); + let pc_ptr = fp.checked_sub(size_of::<usize>())?; + let fp = pc_ptr.checked_sub(size_of::<usize>())?; + Some(StackTrace { + fp: fp, + pc_ptr: pc_ptr as *const usize, + }) + } + } +} diff --git a/src/arch/riscv64/ipi.rs b/src/arch/riscv64/ipi.rs new file mode 100644 index 0000000000..3917398c82 --- /dev/null +++ b/src/arch/riscv64/ipi.rs @@ -0,0 +1,34 @@ +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum IpiKind { + Wakeup = 0x40, + Tlb = 0x41, + Switch = 0x42, + Pit = 0x43, +} + +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum IpiTarget { + Current = 1, + All = 2, + Other = 3, +} + +#[inline(always)] +pub fn ipi(_kind: IpiKind, _target: IpiTarget) { + if cfg!(not(feature = "multi_core")) { + return; + } + + // FIXME implement +} + +#[inline(always)] +pub fn ipi_single(_kind: IpiKind, _target: 
&crate::percpu::PercpuBlock) { + if cfg!(not(feature = "multi_core")) { + return; + } + + // FIXME implement +} diff --git a/src/arch/riscv64/misc.rs b/src/arch/riscv64/misc.rs new file mode 100644 index 0000000000..02b27632ea --- /dev/null +++ b/src/arch/riscv64/misc.rs @@ -0,0 +1,47 @@ +use core::arch::asm; + +use crate::{ + cpu_set::LogicalCpuId, + memory::{RmmA, RmmArch}, + percpu::PercpuBlock, +}; + +#[repr(C)] +pub struct ArchPercpu { + // These fields must be kept first and in this order. Assembly in exception.rs depends on it + pub tmp: usize, + pub s_sp: usize, + + pub percpu: PercpuBlock, +} + +impl PercpuBlock { + pub fn current() -> &'static Self { + unsafe { + let tp: *const ArchPercpu; + asm!( "mv t0, tp", out("t0") tp ); + let arch_percpu = &*tp; + &arch_percpu.percpu + } + } +} + +#[cold] +pub unsafe fn init(cpu_id: LogicalCpuId) { + unsafe { + let frame = crate::memory::allocate_frame().expect("failed to allocate percpu memory"); + let virt = RmmA::phys_to_virt(frame.base()).data() as *mut ArchPercpu; + + virt.write(ArchPercpu { + tmp: 0, + s_sp: 0, + percpu: PercpuBlock::init(cpu_id), + }); + + asm!( + "mv tp, {}", + "csrw sscratch, tp", + in(reg) virt as usize + ); + } +} diff --git a/src/arch/riscv64/mod.rs b/src/arch/riscv64/mod.rs new file mode 100644 index 0000000000..e5ff0b276b --- /dev/null +++ b/src/arch/riscv64/mod.rs @@ -0,0 +1,69 @@ +pub mod consts; +pub mod debug; +pub mod device; +pub mod interrupt; +pub mod ipi; +pub mod misc; +pub mod paging; +pub mod start; +pub mod stop; +pub mod time; + +pub use ::rmm::riscv64::RiscV64Sv39Arch as CurrentRmmArch; +use core::arch::naked_asm; + +pub use arch_copy_to_user as arch_copy_from_user; + +#[unsafe(naked)] +pub unsafe extern "C" fn arch_copy_to_user(dst: usize, src: usize, len: usize) -> u8 { + naked_asm!( + " + .global __usercopy_start + __usercopy_start: + addi sp, sp, -16 + sd fp, 0(sp) + sd ra, 8(sp) + addi fp, sp, 16 + li t1, 1 << 18 // SUM + csrs sstatus, t1 + jal 2f + csrc sstatus, t1 
+ ld ra, -8(fp) + ld fp, -16(fp) + addi sp, sp, 16 + ret + + 2: or t0, a0, a1 + andi t0, t0, 7 + bne t0, x0, 4f + srli t2, a2, 3 + andi a2, a2, 7 + beq t2, x0, 4f + 3: ld t0, 0(a1) + sd t0, 0(a0) + addi a0, a0, 8 + addi a1, a1, 8 + addi t2, t2, -1 + bne t2, x0, 3b + + 4: beq a2, x0, 5f + lb t0, 0(a1) + sb t0, 0(a0) + addi a0, a0, 1 + addi a1, a1, 1 + addi a2, a2, -1 + bne a2, x0, 4b + 5: mv a0, x0 + ret + .global __usercopy_end + __usercopy_end: + " + ) +} + +pub const KFX_SIZE: usize = 1024; + +// This function exists as the KFX size is dynamic on x86_64. +pub fn kfx_size() -> usize { + KFX_SIZE +} diff --git a/src/arch/riscv64/paging.rs b/src/arch/riscv64/paging.rs new file mode 100644 index 0000000000..996e4963f4 --- /dev/null +++ b/src/arch/riscv64/paging.rs @@ -0,0 +1,5 @@ +#[cold] +pub unsafe fn init() { + // Assuming SBI already set up PMAs correctly for us + // TODO: detect Svpbmt present/enabled and override device memory with PBMT=IO +} diff --git a/src/arch/riscv64/start.rs b/src/arch/riscv64/start.rs new file mode 100644 index 0000000000..2551968f05 --- /dev/null +++ b/src/arch/riscv64/start.rs @@ -0,0 +1,140 @@ +use core::{ + arch::naked_asm, + cell::SyncUnsafeCell, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use crate::{ + allocator, + arch::{device, interrupt::exception_handler, paging}, + devices::graphical_debug, + dtb::serial::init_early, + startup::KernelArgs, +}; + +/// Test of zero values in BSS. +static mut BSS_TEST_ZERO: usize = 0; +/// Test of non-zero values in data. 
+static mut DATA_TEST_NONZERO: usize = 0xFFFF_FFFF_FFFF_FFFF; + +pub static BOOT_HART_ID: AtomicUsize = AtomicUsize::new(0); + +fn get_boot_hart_id(env: &[u8]) -> Option { + for line in core::str::from_utf8(env).unwrap_or("").lines() { + let mut parts = line.splitn(2, '='); + let name = parts.next().unwrap_or(""); + let value = parts.next().unwrap_or(""); + + if name == "BOOT_HART_ID" { + return usize::from_str_radix(value, 16).ok(); + } + } + None +} + +#[repr(C, align(16))] +struct StackAlign(T); + +static STACK: SyncUnsafeCell> = + SyncUnsafeCell::new(StackAlign([0; 128 * 1024])); + +// FIXME use extern "custom" +#[unsafe(naked)] +#[unsafe(no_mangle)] +extern "C" fn kstart() { + naked_asm!(" + mv gp, x0 // ensure gp relative accesses crash + mv tp, x0 // reset percpu until it is initialized + csrw sscratch, tp + + // BSS should already be zero + ld t0, {bss_test_zero} + bnez t0, .Lkstart_crash + ld t0, {data_test_nonzero} + beqz t0, .Lkstart_crash + + .Lpcrel_hi0: + auipc sp, %pcrel_hi({stack}+{stack_size}-16) + addi sp, sp, %pcrel_lo(.Lpcrel_hi0) + + la t0, {exception_handler} // WARL=0 - direct mode combined handler + csrw stvec, t0 + + li ra, 0 + j {start} + + .Lkstart_crash: + jr x0 + ", + bss_test_zero = sym BSS_TEST_ZERO, + data_test_nonzero = sym DATA_TEST_NONZERO, + exception_handler = sym exception_handler, + stack = sym STACK, + stack_size = const size_of_val(&STACK), + start = sym start, + ); +} + +/// The entry to Rust, all things must be initialized +unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! 
{ + unsafe { + let bootstrap = { + let args = args_ptr.read(); + + let dtb_data = if args.hwdesc_base != 0 { + Some(( + crate::PHYS_OFFSET + args.hwdesc_base as usize, + args.hwdesc_size as usize, + )) + } else { + None + }; + let dtb = args.dtb(); + + graphical_debug::init(args.env()); + + if let Some(dtb) = &dtb { + init_early(dtb); + } + + info!("Redox OS starting..."); + args.print(); + + if let Some(dtb) = &dtb { + device::dump_fdt(&dtb); + } + + // Initialize RMM + crate::startup::memory::init(&args, None, None); + + let boot_hart_id = + get_boot_hart_id(args.env()).expect("Didn't get boot HART id from bootloader"); + info!("Booting on HART {}", boot_hart_id); + BOOT_HART_ID.store(boot_hart_id, Ordering::Relaxed); + + paging::init(); + + crate::arch::misc::init(crate::cpu_set::LogicalCpuId::new(0)); + + // Setup kernel heap + allocator::init(); + + // Activate memory logging + crate::log::init(); + + crate::dtb::init(dtb_data); + + // Initialize devices + device::init(); + + // Initialize all of the non-core devices not otherwise needed to complete initialization + device::init_noncore(); + + // FIXME bringup AP HARTs + + args.bootstrap() + }; + + crate::startup::kmain(bootstrap); + } +} diff --git a/src/arch/riscv64/stop.rs b/src/arch/riscv64/stop.rs new file mode 100644 index 0000000000..920aa60793 --- /dev/null +++ b/src/arch/riscv64/stop.rs @@ -0,0 +1,15 @@ +use crate::sync::CleanLockToken; + +pub unsafe fn kreset() -> ! { + println!("kreset"); + unimplemented!() +} + +pub unsafe fn emergency_reset() -> ! { + unimplemented!() +} + +pub unsafe fn kstop(_token: &mut CleanLockToken) -> ! 
{ + println!("kstop"); + unimplemented!() +} diff --git a/src/arch/riscv64/time.rs b/src/arch/riscv64/time.rs new file mode 100644 index 0000000000..f089112167 --- /dev/null +++ b/src/arch/riscv64/time.rs @@ -0,0 +1,34 @@ +use core::{ + arch::asm, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use crate::sync::CleanLockToken; + +static MTIME_FREQ_HZ: AtomicUsize = AtomicUsize::new(0); + +pub fn init(freq_hz: usize) { + MTIME_FREQ_HZ.store(freq_hz, Ordering::Relaxed); +} + +pub fn monotonic_absolute(_token: &mut CleanLockToken) -> u128 { + let freq_hz = MTIME_FREQ_HZ.load(Ordering::Relaxed); + if freq_hz > 0 { + let counter: usize; + unsafe { + asm!( + "rdtime t0", + lateout("t0") counter + ); + }; + counter as u128 * 1_000_000_000u128 / freq_hz as u128 + } else { + 0 + } +} + +pub fn monotonic_resolution() -> u128 { + let freq_hz = MTIME_FREQ_HZ.load(Ordering::Relaxed); + + 1_000_000_000u128 / freq_hz as u128 +} diff --git a/src/arch/x86/consts.rs b/src/arch/x86/consts.rs new file mode 100644 index 0000000000..28c5073f5a --- /dev/null +++ b/src/arch/x86/consts.rs @@ -0,0 +1,20 @@ +// Because the memory map is so important to not be aliased, it is defined here, in one place +// The lower 256 PML4 entries are reserved for userspace +// Each PML4 entry references up to 512 GB of memory +// The second from the top (510) PML4 is reserved for the kernel + +// Framebuffer mapped by bootloader to 0xD000_0000 (128 MiB max) + +// Offset to APIC mappings (optional) +pub const LAPIC_OFFSET: usize = 0xD800_0000; +pub const IOAPIC_OFFSET: usize = LAPIC_OFFSET + 4096; +pub const HPET_OFFSET: usize = IOAPIC_OFFSET + 4096; + +/// Offset to kernel heap (256 MiB max) +#[inline(always)] +pub fn kernel_heap_offset() -> usize { + 0xE000_0000 +} + +/// End offset of the user image, i.e. 
kernel start +pub const USER_END_OFFSET: usize = 0x8000_0000; diff --git a/src/arch/x86/interrupt/handler.rs b/src/arch/x86/interrupt/handler.rs new file mode 100644 index 0000000000..40c347d028 --- /dev/null +++ b/src/arch/x86/interrupt/handler.rs @@ -0,0 +1,471 @@ +use crate::{arch::flags::FLAG_SINGLESTEP, memory::ArchIntCtx, panic, syscall::IntRegisters}; + +#[derive(Default)] +#[repr(C, packed)] +pub struct ScratchRegisters { + pub edx: usize, + pub ecx: usize, + pub eax: usize, +} + +impl ScratchRegisters { + pub fn dump(&self) { + println!("EAX: {:08x}", { self.eax }); + println!("ECX: {:08x}", { self.ecx }); + println!("EDX: {:08x}", { self.edx }); + } +} + +#[derive(Default)] +#[repr(C, packed)] +pub struct PreservedRegisters { + pub ebp: usize, + pub esi: usize, + pub edi: usize, + pub ebx: usize, +} + +impl PreservedRegisters { + pub fn dump(&self) { + println!("EBX: {:08x}", { self.ebx }); + println!("EDI: {:08x}", { self.edi }); + println!("ESI: {:08x}", { self.esi }); + println!("EBP: {:08x}", { self.ebp }); + } +} + +#[derive(Default)] +#[repr(C, packed)] +pub struct IretRegisters { + pub eip: usize, + pub cs: usize, + pub eflags: usize, + + // ---- + // The following will only be present if interrupt is raised from another + // privilege ring. Otherwise, they are undefined values. + // ---- + pub esp: usize, + pub ss: usize, +} + +impl IretRegisters { + pub fn dump(&self) { + println!("EFLAG: {:08x}", { self.eflags }); + println!("CS: {:08x}", { self.cs }); + println!("EIP: {:08x}", { self.eip }); + + if self.cs & 0b11 != 0b00 { + println!("ESP: {:08x}", { self.esp }); + println!("SS: {:08x}", { self.ss }); + } + } +} + +#[derive(Default)] +#[repr(C, packed)] +pub struct InterruptStack { + pub gs: usize, + pub preserved: PreservedRegisters, + pub scratch: ScratchRegisters, + pub iret: IretRegisters, +} + +impl InterruptStack { + pub fn init(&mut self) { + // Always enable interrupts! 
+ self.iret.eflags = x86::bits32::eflags::EFlags::FLAGS_IF.bits() as usize; + self.iret.ss = (crate::arch::gdt::GDT_USER_DATA << 3) | 3; + self.iret.cs = (crate::arch::gdt::GDT_USER_CODE << 3) | 3; + self.gs = (crate::arch::gdt::GDT_USER_GS << 3) | 3; + } + pub fn dump(&self) { + self.iret.dump(); + self.scratch.dump(); + self.preserved.dump(); + } + pub fn trace(&self) { + self.dump(); + unsafe { + panic::user_stack_trace(&self); + panic::stack_trace(); + } + } + /// Saves all registers to a struct used by the proc: + /// scheme to read/write registers. + pub fn save(&self, all: &mut IntRegisters) { + all.ebp = self.preserved.ebp; + all.esi = self.preserved.esi; + all.edi = self.preserved.edi; + all.ebx = self.preserved.ebx; + all.edx = self.scratch.edx; + all.ecx = self.scratch.ecx; + all.eax = self.scratch.eax; + all.eip = self.iret.eip; + all.cs = self.iret.cs; + all.eflags = self.iret.eflags; + + // Set esp and ss: + + const CPL_MASK: usize = 0b11; + + let cs: usize; + unsafe { + core::arch::asm!("mov {}, cs", out(reg) cs); + } + + if self.iret.cs & CPL_MASK == cs & CPL_MASK { + // Privilege ring didn't change, so neither did the stack + all.esp = self as *const Self as usize // esp after Self was pushed to the stack + + size_of::() // disregard Self + - size_of::() * 2; // well, almost: esp and ss need to be excluded as they aren't present + unsafe { + core::arch::asm!("mov {}, ss", out(reg) all.ss); + } + } else { + all.esp = self.iret.esp; + all.ss = self.iret.ss; + } + } + pub fn frame_pointer(&self) -> usize { + self.preserved.ebp + } + pub fn stack_pointer(&self) -> usize { + self.iret.esp + } + pub fn set_stack_pointer(&mut self, esp: usize) { + self.iret.esp = esp; + } + pub fn instr_pointer(&self) -> usize { + self.iret.eip + } + pub fn sig_archdep_reg(&self) -> usize { + self.iret.eflags + } + pub fn set_instr_pointer(&mut self, eip: usize) { + self.iret.eip = eip; + } + /// Loads all registers from a struct used by the proc: + /// scheme to 
read/write registers. + pub fn load(&mut self, all: &IntRegisters) { + // TODO: Which of these should be allowed to change? + + self.preserved.ebp = all.ebp; + self.preserved.esi = all.esi; + self.preserved.edi = all.edi; + self.preserved.ebx = all.ebx; + self.scratch.edx = all.edx; + self.scratch.ecx = all.ecx; + self.scratch.eax = all.eax; + self.iret.eip = all.eip; + + // FIXME: The interrupt stack on which this is called, is always from userspace, but make + // the API safer. + self.iret.esp = all.esp; + + // OF, DF, 0, TF => D + // SF, ZF, 0, AF => D + // 0, PF, 1, CF => 5 + const ALLOWED_EFLAGS: usize = 0xDD5; + + // These should probably be restricted + // self.iret.cs = all.cs; + self.iret.eflags &= !ALLOWED_EFLAGS; + self.iret.eflags |= all.eflags & ALLOWED_EFLAGS; + } + /// Enables the "Trap Flag" in the FLAGS register, causing the CPU + /// to send a Debug exception after the next instruction. This is + /// used for singlestep in the proc: scheme. + pub fn set_singlestep(&mut self, enabled: bool) { + if enabled { + self.iret.eflags |= FLAG_SINGLESTEP; + } else { + self.iret.eflags &= !FLAG_SINGLESTEP; + } + } +} + +#[derive(Default)] +#[repr(C, packed)] +pub struct InterruptErrorStack { + pub code: usize, + pub inner: InterruptStack, +} + +impl InterruptErrorStack { + pub fn dump(&self) { + println!("CODE: {:08x}", { self.code }); + self.inner.dump(); + } + pub fn trace(&self) { + self.dump(); + unsafe { + panic::user_stack_trace(&self.inner); + panic::stack_trace(); + } + } +} + +#[macro_export] +macro_rules! push_scratch { + () => { + " + // Push scratch registers (minus eax) + push ecx + push edx + " + }; +} +#[macro_export] +macro_rules! pop_scratch { + () => { + " + // Pop scratch registers + pop edx + pop ecx + pop eax + " + }; +} + +#[macro_export] +macro_rules! push_preserved { + () => { + " + // Push preserved registers + push ebx + push edi + push esi + push ebp + " + }; +} +#[macro_export] +macro_rules! 
pop_preserved { + () => { + " + // Pop preserved registers + pop ebp + pop esi + pop edi + pop ebx + " + }; +} + +// Must always happen after push_scratch +macro_rules! enter_gs { + () => { + " + // Enter kernel GS segment + mov ecx, gs + push ecx + mov ecx, 0x18 + mov gs, ecx + " + }; +} + +// Must always happen before pop_scratch +macro_rules! exit_gs { + () => { + " + // Exit kernel GS segment + pop ecx + mov gs, ecx + " + }; +} + +#[macro_export] +macro_rules! interrupt_stack { + // XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to + // use idents directly instead. + ($name:ident, |$stack:ident| $code:block) => { + #[unsafe(naked)] + pub unsafe extern "C" fn $name() { + unsafe extern "fastcall" fn inner($stack: &mut $crate::arch::x86::interrupt::InterruptStack) { + // TODO: Force the declarations to specify unsafe? + + #[allow(unused_unsafe)] + unsafe { + $code + } + } + core::arch::naked_asm!( + // Backup all userspace registers to stack + "push eax", + push_scratch!(), + push_preserved!(), + + // Enter kernel TLS segment + enter_gs!(), + + // TODO: Map PTI + // $crate::arch::x86::pti::map(); + + // Call inner function with pointer to stack + " + mov ecx, esp + call {inner} + ", + + // TODO: Unmap PTI + // $crate::arch::x86::pti::unmap(); + + // Exit kernel TLS segment + exit_gs!(), + + // Restore all userspace registers + pop_preserved!(), + pop_scratch!(), + + "iretd", + + inner = sym inner, + ); + } + }; + ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, |$stack| $code); }; + ($name:ident, @paranoid, |$stack:ident| $code:block) => { interrupt_stack!($name, |$stack| $code); } +} + +#[macro_export] +macro_rules! 
interrupt { + ($name:ident, || $code:block) => { + #[unsafe(naked)] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner() { + $code + } + + core::arch::naked_asm!( + // Backup all userspace registers to stack + "push eax", + push_scratch!(), + + // Enter kernel TLS segment + enter_gs!(), + + // TODO: Map PTI + // $crate::arch::x86::pti::map(); + + // Call inner function with pointer to stack + "call {inner}", + + // TODO: Unmap PTI + // $crate::arch::x86::pti::unmap(); + + // Exit kernel TLS segment + exit_gs!(), + + // Restore all userspace registers + pop_scratch!(), + + "iretd", + + inner = sym inner, + ); + } + }; +} + +#[macro_export] +macro_rules! interrupt_error { + ($name:ident, |$stack:ident, $error_code:ident| $code:block) => { + #[unsafe(naked)] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner($stack: &mut $crate::arch::x86::interrupt::handler::InterruptErrorStack) { + let $error_code: usize = $stack.code; + $code + } + + core::arch::naked_asm!( + // Move eax into code's place, put code in last instead (to be + // compatible with InterruptStack) + "xchg [esp], eax", + + // Push all userspace registers + push_scratch!(), + push_preserved!(), + + // Enter kernel TLS segment + enter_gs!(), + + // Put code in, it's now in eax + "push eax", + + // TODO: Map PTI + // $crate::arch::x86::pti::map(); + + // Call inner function with pointer to stack + " + push esp + call {inner} + ", + // add esp, 4 + + // TODO: Unmap PTI (split "add esp, 8" into two "add esp, 4"s maybe?) + // $crate::arch::x86::pti::unmap(); + + // Pop previous esp and code + "add esp, 8", + + // Exit kernel TLS segment + exit_gs!(), + + // Restore all userspace registers + pop_preserved!(), + pop_scratch!(), + + // The error code has already been popped, so use the regular macro. 
+ "iretd", + inner = sym inner, + ); + } + }; +} +#[unsafe(naked)] +unsafe extern "C" fn usercopy_trampoline() { + core::arch::naked_asm!( + " + pop esi + pop edi + + mov eax, 1 + ret + " + ); +} + +impl ArchIntCtx for InterruptStack { + fn ip(&self) -> usize { + self.iret.eip + } + fn recover_and_efault(&mut self) { + // Unlike on x86_64, Protected Mode interrupts will not save/restore esp and ss unless + // privilege rings changed, which they won't here as we are catching a kernel-induced page + // fault. + // + // Thus, it is only possible to change scratch/preserved registers, and EIP. While it may + // be feasible to set ECX to zero to stop the REP MOVSB, or increase EIP by 2 (REP MOVSB is + // f3 a4, i.e. 2 bytes), this trampoline allows any memcpy implementation, that reasonably + // pushes preserved registers to the stack. + self.iret.eip = usercopy_trampoline as usize; + } +} + +#[unsafe(naked)] +pub unsafe extern "C" fn enter_usermode() { + core::arch::naked_asm!( + // TODO: Unmap PTI + // $crate::arch::x86::pti::unmap(); + + // Exit kernel TLS segment + exit_gs!(), + // Restore all userspace registers + pop_preserved!(), + pop_scratch!(), + "iretd", + ) +} diff --git a/src/arch/x86/interrupt/mod.rs b/src/arch/x86/interrupt/mod.rs new file mode 100644 index 0000000000..efc4999262 --- /dev/null +++ b/src/arch/x86/interrupt/mod.rs @@ -0,0 +1,10 @@ +//! Interrupt instructions + +pub use crate::arch::x86_shared::interrupt::*; + +#[macro_use] +pub mod handler; + +pub mod syscall; + +pub use self::handler::InterruptStack; diff --git a/src/arch/x86/interrupt/syscall.rs b/src/arch/x86/interrupt/syscall.rs new file mode 100644 index 0000000000..22895c11ac --- /dev/null +++ b/src/arch/x86/interrupt/syscall.rs @@ -0,0 +1,48 @@ +use crate::{ + ptrace, + sync::CleanLockToken, + syscall, + syscall::flag::{PTRACE_FLAG_IGNORE, PTRACE_STOP_POST_SYSCALL, PTRACE_STOP_PRE_SYSCALL}, +}; + +pub unsafe fn init() {} + +macro_rules! 
with_interrupt_stack { + (|$stack:ident, $token:ident| $code:block) => {{ + let mut $token = CleanLockToken::new(); + + let allowed = ptrace::breakpoint_callback(PTRACE_STOP_PRE_SYSCALL, None, &mut $token) + .and_then(|_| ptrace::next_breakpoint().map(|f| !f.contains(PTRACE_FLAG_IGNORE))); + + if allowed.unwrap_or(true) { + // If the syscall is `clone`, the clone won't return here. Instead, + // it'll return early and leave any undropped values. This is + // actually GOOD, because any references are at that point UB + // anyway, because they are based on the wrong stack. + let $stack = &mut *$stack; + $code + } + + ptrace::breakpoint_callback(PTRACE_STOP_POST_SYSCALL, None, &mut $token); + }}; +} + +interrupt_stack!(syscall, |stack| { + with_interrupt_stack!(|stack, token| { + let scratch = &stack.scratch; + let preserved = &stack.preserved; + let ret = syscall::syscall( + scratch.eax, + preserved.ebx, + scratch.ecx, + scratch.edx, + preserved.esi, + preserved.edi, + preserved.ebp, + &mut token, + ); + stack.scratch.eax = ret; + }) +}); + +pub use super::handler::enter_usermode; diff --git a/src/arch/x86/mod.rs b/src/arch/x86/mod.rs new file mode 100644 index 0000000000..025c577319 --- /dev/null +++ b/src/arch/x86/mod.rs @@ -0,0 +1,41 @@ +pub use crate::arch::x86_shared::*; + +/// Constants like memory locations +pub mod consts; + +/// Interrupt instructions +#[macro_use] +pub mod interrupt; + +#[unsafe(naked)] +pub unsafe extern "C" fn arch_copy_to_user(dst: usize, src: usize, len: usize) -> u8 { + core::arch::naked_asm!( + " + .global __usercopy_start + __usercopy_start: + push edi + push esi + + mov edi, [esp + 12] # dst + mov esi, [esp + 16] # src + mov ecx, [esp + 20] # len + rep movsb + + pop esi + pop edi + + xor eax, eax + ret + .global __usercopy_end + __usercopy_end: + " + ); +} +pub use arch_copy_to_user as arch_copy_from_user; + +pub const KFX_SIZE: usize = 512; + +// This function exists as the KFX size is dynamic on x86_64. 
+pub fn kfx_size() -> usize { + KFX_SIZE +} diff --git a/src/arch/x86_64/alternative.rs b/src/arch/x86_64/alternative.rs new file mode 100644 index 0000000000..b10177d79d --- /dev/null +++ b/src/arch/x86_64/alternative.rs @@ -0,0 +1,317 @@ +#![allow(unused_imports)] + +use spin::Once; +use x86::controlregs::{Cr4, Xcr0}; + +use crate::{ + arch::cpuid::{cpuid, feature_info, has_ext_feat}, + context::memory::PageSpan, + memory::{KernelMapper, Page, PageFlags, VirtualAddress, PAGE_SIZE}, +}; + +#[cfg(all(cpu_feature_never = "xsave", not(cpu_feature_never = "xsaveopt")))] +compile_error!("cannot force-disable xsave without force-disabling xsaveopt"); + +#[repr(C)] +#[derive(Clone, Copy, Debug)] +pub struct AltReloc { + // These two fields point to a utf-8 name of the feature, see the match statement below. + pub name_start: *const u8, + pub name_len: usize, + + // Base address of the code that may later be overwritten. + pub code_start: *mut u8, + // Length of the default code, excluding NOPs if the altcode sequence is longer. + pub origcode_len: usize, + // Actual length of the overwritable code, i.e. max(origcode_len, altcode_len). 
+ pub padded_len: usize, + pub _rsvd: usize, + + // These two fields point to the alternative code (in .rodata), and possible new nop bytes, + // that will replace the code_start..+padded_len + pub altcode_start: *const u8, + pub altcode_len: usize, +} + +#[cold] +pub unsafe fn early_init(bsp: bool) { + unsafe { + let relocs_offset = crate::kernel_executable_offsets::__altrelocs_start(); + // __altrelocs_end > __altrelocs_start so this cannot overflow + #[expect(clippy::arithmetic_side_effects)] + let relocs_size = crate::kernel_executable_offsets::__altrelocs_end() - relocs_offset; + + // AltReloc is not a ZST so the modulo and division will never panic + #[expect(clippy::arithmetic_side_effects)] + { + assert_eq!(relocs_size % size_of::<AltReloc>(), 0) + } + #[expect(clippy::arithmetic_side_effects)] + let relocs = core::slice::from_raw_parts( + relocs_offset as *const AltReloc, + relocs_size / size_of::<AltReloc>(), + ); + + let mut enable = KcpuFeatures::empty(); + + if cfg!(not(cpu_feature_never = "smap")) && has_ext_feat(|feat| feat.has_smap()) { + // SMAP (Supervisor-Mode Access Prevention) forbids the kernel from accessing any + // userspace-accessible pages, with the necessary exception of when RFLAGS.AC = 1. This + // limits user-memory accesses to the UserSlice wrapper, so that no data outside of + // usercopy functions can be accidentally accessed by the kernel. + x86::controlregs::cr4_write(x86::controlregs::cr4() | Cr4::CR4_ENABLE_SMAP); + // Clear RFLAGS.AC in (the probably unlikely) case the bootloader set it earlier.
+ x86::bits64::rflags::clac(); + + enable |= KcpuFeatures::SMAP; + } else { + assert!(cfg!(not(cpu_feature_always = "smap"))); + } + + if cfg!(not(cpu_feature_never = "fsgsbase")) + && let Some(f) = cpuid().get_extended_feature_info() + && f.has_fsgsbase() + { + x86::controlregs::cr4_write( + x86::controlregs::cr4() | x86::controlregs::Cr4::CR4_ENABLE_FSGSBASE, + ); + + enable |= KcpuFeatures::FSGSBASE; + } else { + assert!(cfg!(not(cpu_feature_always = "fsgsbase"))); + } + + #[cfg(not(cpu_feature_never = "xsave"))] + if feature_info().has_xsave() { + use raw_cpuid::{ExtendedRegisterStateLocation, ExtendedRegisterType}; + + x86::controlregs::cr4_write( + x86::controlregs::cr4() | x86::controlregs::Cr4::CR4_ENABLE_OS_XSAVE, + ); + + let mut xcr0 = Xcr0::XCR0_FPU_MMX_STATE | Xcr0::XCR0_SSE_STATE; + x86::controlregs::xcr0_write(xcr0); + let ext_state_info = cpuid() + .get_extended_state_info() + .expect("must be present if XSAVE is supported"); + + enable |= KcpuFeatures::XSAVE; + enable.set(KcpuFeatures::XSAVEOPT, ext_state_info.has_xsaveopt()); + + let info = xsave::XsaveInfo { + ymm_upper_offset: feature_info().has_avx().then(|| { + xcr0 |= Xcr0::XCR0_AVX_STATE; + x86::controlregs::xcr0_write(xcr0); + + let state = ext_state_info + .iter() + .find(|state| { + state.register() == ExtendedRegisterType::Avx + && state.location() == ExtendedRegisterStateLocation::Xcr0 + }) + .expect("CPUID said AVX was supported but there's no state info"); + + // 16 * size_of::() is well below usize::MAX + #[expect(clippy::arithmetic_side_effects)] + if state.size() as usize != 16 * size_of::() { + warn!("Unusual AVX state size {}", state.size()); + } + + state.offset() + }), + xsave_size: ext_state_info.xsave_area_size_enabled_features(), + }; + debug!("XSAVE: {:?}", info); + + xsave::XSAVE_INFO.call_once(|| info); + } else { + assert!(cfg!(not(cpu_feature_always = "xsave"))); + } + + if !bsp { + return; + } + + overwrite(relocs, enable); + + if cfg!(not(feature = "self_modifying")) 
{ + assert!( + cfg!(not(cpu_feature_auto = "smap")) + && cfg!(not(cpu_feature_auto = "fsgsbase")) + && cfg!(not(cpu_feature_auto = "xsave")) + && cfg!(not(cpu_feature_auto = "xsaveopt")) + ); + } + + FEATURES.call_once(|| enable); + } +} + +unsafe fn overwrite(relocs: &[AltReloc], enable: KcpuFeatures) { + unsafe { + if cfg!(not(feature = "self_modifying")) { + return; + } + + debug!("self-modifying features: {:?}", enable); + + let mut mapper = KernelMapper::lock_rw(); + for reloc in relocs.iter().copied() { + let name = core::str::from_utf8(core::slice::from_raw_parts( + reloc.name_start, + reloc.name_len, + )) + .expect("invalid feature name"); + let altcode = core::slice::from_raw_parts(reloc.altcode_start, reloc.altcode_len); + + let dst_pages = PageSpan::between( + Page::containing_address(VirtualAddress::new(reloc.code_start as usize)), + Page::containing_address(VirtualAddress::new( + (reloc.code_start as usize + reloc.padded_len).next_multiple_of(PAGE_SIZE), + )), + ); + for page in dst_pages.pages() { + mapper + .remap( + page.start_address(), + PageFlags::new().write(true).execute(true).global(true), + ) + .unwrap() + .flush(); + } + + let code = core::slice::from_raw_parts_mut(reloc.code_start, reloc.padded_len); + + trace!( + "feature {} current {:x?} altcode {:x?}", + name, + code, + altcode + ); + + let feature_is_enabled = match name { + "smap" => enable.contains(KcpuFeatures::SMAP), + "fsgsbase" => enable.contains(KcpuFeatures::FSGSBASE), + "xsave" => enable.contains(KcpuFeatures::XSAVE), + "xsaveopt" => enable.contains(KcpuFeatures::XSAVEOPT), + //_ => panic!("unknown altcode relocation: {}", name), + _ => true, + }; + + // XXX: The `.nops` directive only works for constant lengths, and the variable `.skip -X` + // only outputs the (slower) single-byte 0x90 NOP. + + // This table is from the "Software Optimization Guide for AMD Family 19h Processors" (November + // 2020). 
+ const NOPS_TABLE: [&[u8]; 11] = [ + &[0x90], + &[0x66, 0x90], + &[0x0f, 0x1f, 0x00], + &[0x0f, 0x1f, 0x40, 0x00], + &[0x0f, 0x1f, 0x44, 0x00, 0x00], + &[0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00], + &[0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00], + &[0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00], + &[0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00], + &[0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00], + &[ + 0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + ]; + + if feature_is_enabled { + trace!("feature {} origcode {:x?}", name, code); + let (dst, dst_nops) = code.split_at_mut(altcode.len()); + dst.copy_from_slice(altcode); + + for chunk in dst_nops.chunks_mut(NOPS_TABLE.len()) { + // `chunk.len() - 1` is always in bounds because we are chunking by + // `NOPS_TABLE.len()` + #[expect(clippy::indexing_slicing)] + // `chunk.len()` will never be 0 + #[expect(clippy::arithmetic_side_effects)] + chunk.copy_from_slice(NOPS_TABLE[chunk.len() - 1]); + } + trace!("feature {} new {:x?} altcode {:x?}", name, code, altcode); + } else { + trace!("feature !{} origcode {:x?}", name, code); + let (_, padded) = code.split_at_mut(reloc.origcode_len); + + // Not strictly necessary, but reduces the number of instructions using longer nop + // instructions. + for chunk in padded.chunks_mut(NOPS_TABLE.len()) { + // `chunk.len() - 1` is always in bounds because we are chunking by + // `NOPS_TABLE.len()` + #[expect(clippy::indexing_slicing)] + // `chunk.len()` will never be 0 + #[expect(clippy::arithmetic_side_effects)] + chunk.copy_from_slice(NOPS_TABLE[chunk.len() - 1]); + } + + trace!("feature !{} new {:x?}", name, code); + } + + for page in dst_pages.pages() { + mapper + .remap( + page.start_address(), + PageFlags::new().write(false).execute(true).global(true), + ) + .unwrap() + .flush(); + } + } + } +} + +bitflags! 
{ + #[derive(Clone, Copy, Debug)] + pub struct KcpuFeatures: usize { + const SMAP = 1; + const FSGSBASE = 2; + const XSAVE = 4; + const XSAVEOPT = 8; + } +} + +static FEATURES: Once<KcpuFeatures> = Once::new(); + +pub fn features() -> KcpuFeatures { + *FEATURES.get().expect("early_cpu_init was not called") +} + +#[cfg(not(cpu_feature_never = "xsave"))] +mod xsave { + use spin::Once; + + #[derive(Debug)] + pub struct XsaveInfo { + pub ymm_upper_offset: Option<u32>, + pub xsave_size: u32, + } + pub(super) static XSAVE_INFO: Once<XsaveInfo> = Once::new(); + + pub fn info() -> Option<&'static XsaveInfo> { + XSAVE_INFO.get() + } +} + +pub fn kfx_size() -> usize { + #[cfg(not(cpu_feature_never = "xsave"))] + { + // This won't overflow + #[expect(clippy::arithmetic_side_effects)] + match xsave::info() { + Some(info) => FXSAVE_SIZE + XSAVE_HEADER_SIZE + info.xsave_size as usize, + None => FXSAVE_SIZE, + } + } + #[cfg(cpu_feature_never = "xsave")] + { + // FXSAVE size + FXSAVE_SIZE + } +} + +pub const FXSAVE_SIZE: usize = 512; +pub const XSAVE_HEADER_SIZE: usize = 64; diff --git a/src/arch/x86_64/consts.rs b/src/arch/x86_64/consts.rs new file mode 100644 index 0000000000..547b9bfadb --- /dev/null +++ b/src/arch/x86_64/consts.rs @@ -0,0 +1,26 @@ +// Because the memory map is so important to not be aliased, it is defined here, in one place. +// +// - The lower half (256 PML4 entries; 128 TiB) is reserved for userspace. These mappings are +// associated with _address spaces_, and change when context switching, unless the address spaces +// match. +// - The upper half is reserved for the kernel. Kernel mappings are preserved across context +// switches. +// +// Each PML4 entry references 512 GiB of virtual memory.
+ +/// The size of a single PML4 +pub const PML4_SIZE: usize = 0x0000_0080_0000_0000; + +/// Offset of kernel +const KERNEL_OFFSET: usize = (1_usize << 31).wrapping_neg(); + +/// Offset to kernel heap +#[inline(always)] +pub fn kernel_heap_offset() -> usize { + crate::kernel_executable_offsets::KERNEL_OFFSET() - PML4_SIZE +} + +/// End offset of the user image, i.e. kernel start +// TODO: Make this offset at least PAGE_SIZE less? There are known hardware bugs on some arches, +// for example on x86 if instructions execute near the 48-bit canonical address boundary. +pub const USER_END_OFFSET: usize = 256 * PML4_SIZE; diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs new file mode 100644 index 0000000000..676747ed16 --- /dev/null +++ b/src/arch/x86_64/interrupt/handler.rs @@ -0,0 +1,532 @@ +use crate::{arch::flags::FLAG_SINGLESTEP, memory::ArchIntCtx, panic, syscall::IntRegisters}; + +#[derive(Default)] +#[repr(C)] +pub struct ScratchRegisters { + pub r11: usize, + pub r10: usize, + pub r9: usize, + pub r8: usize, + pub rsi: usize, + pub rdi: usize, + pub rdx: usize, + pub rcx: usize, + pub rax: usize, +} + +impl ScratchRegisters { + pub fn dump(&self) { + println!("RAX: {:016x}", { self.rax }); + println!("RCX: {:016x}", { self.rcx }); + println!("RDX: {:016x}", { self.rdx }); + println!("RDI: {:016x}", { self.rdi }); + println!("RSI: {:016x}", { self.rsi }); + println!("R8: {:016x}", { self.r8 }); + println!("R9: {:016x}", { self.r9 }); + println!("R10: {:016x}", { self.r10 }); + println!("R11: {:016x}", { self.r11 }); + } +} + +#[derive(Default)] +#[repr(C)] +pub struct PreservedRegisters { + pub r15: usize, + pub r14: usize, + pub r13: usize, + pub r12: usize, + pub rbp: usize, + pub rbx: usize, +} + +impl PreservedRegisters { + pub fn dump(&self) { + println!("RBX: {:016x}", { self.rbx }); + println!("RBP: {:016x}", { self.rbp }); + println!("R12: {:016x}", { self.r12 }); + println!("R13: {:016x}", { self.r13 }); + 
println!("R14: {:016x}", { self.r14 }); + println!("R15: {:016x}", { self.r15 }); + } +} + +#[derive(Default)] +#[repr(C)] +pub struct IretRegisters { + pub rip: usize, + pub cs: usize, + pub rflags: usize, + + // In x86 Protected Mode, i.e. 32-bit kernels, the following two registers are conditionally + // pushed if the privilege ring changes. In x86 Long Mode however, i.e. 64-bit kernels, they + // are unconditionally pushed, mostly due to stack alignment requirements. + pub rsp: usize, + pub ss: usize, +} + +impl IretRegisters { + pub fn dump(&self) { + println!("RFLAG: {:016x}", { self.rflags }); + println!("CS: {:016x}", { self.cs }); + println!("RIP: {:016x}", { self.rip }); + + println!("RSP: {:016x}", { self.rsp }); + println!("SS: {:016x}", { self.ss }); + + unsafe { + let fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE); + let gsbase = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE); + let kgsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE); + println!( + "FSBASE {:016x}\nGSBASE {:016x}\nKGSBASE {:016x}", + fsbase, gsbase, kgsbase + ); + } + } +} + +#[derive(Default)] +#[repr(C)] +pub struct InterruptStack { + pub preserved: PreservedRegisters, + pub scratch: ScratchRegisters, + pub iret: IretRegisters, +} + +impl InterruptStack { + pub fn init(&mut self) { + // Always enable interrupts! 
+ self.iret.rflags = x86::bits64::rflags::RFlags::FLAGS_IF.bits() as usize; + self.iret.cs = (crate::arch::gdt::GDT_USER_CODE << 3) | 3; + self.iret.ss = (crate::arch::gdt::GDT_USER_DATA << 3) | 3; + } + pub fn frame_pointer(&self) -> usize { + self.preserved.rbp + } + pub fn stack_pointer(&self) -> usize { + self.iret.rsp + } + pub fn set_stack_pointer(&mut self, rsp: usize) { + self.iret.rsp = rsp; + } + pub fn instr_pointer(&self) -> usize { + self.iret.rip + } + pub fn sig_archdep_reg(&self) -> usize { + self.iret.rflags + } + pub fn set_instr_pointer(&mut self, rip: usize) { + self.iret.rip = rip; + } + pub fn set_arg1(&mut self, arg_opt: Option) { + if let Some(arg) = arg_opt { + self.scratch.rsi = arg; + } + } + pub fn dump(&self) { + self.iret.dump(); + self.scratch.dump(); + self.preserved.dump(); + } + pub fn trace(&self) { + self.dump(); + unsafe { + panic::user_stack_trace(self); + panic::stack_trace(); + } + } + /// Saves all registers to a struct used by the proc: + /// scheme to read/write registers. + pub fn save(&self, all: &mut IntRegisters) { + all.r15 = self.preserved.r15; + all.r14 = self.preserved.r14; + all.r13 = self.preserved.r13; + all.r12 = self.preserved.r12; + all.rbp = self.preserved.rbp; + all.rbx = self.preserved.rbx; + all.r11 = self.scratch.r11; + all.r10 = self.scratch.r10; + all.r9 = self.scratch.r9; + all.r8 = self.scratch.r8; + all.rsi = self.scratch.rsi; + all.rdi = self.scratch.rdi; + all.rdx = self.scratch.rdx; + all.rcx = self.scratch.rcx; + all.rax = self.scratch.rax; + all.rip = self.iret.rip; + all.cs = self.iret.cs; + all.rflags = self.iret.rflags; + all.rsp = self.iret.rsp; + all.ss = self.iret.ss; + } + /// Loads all registers from a struct used by the proc: + /// scheme to read/write registers. 
+ pub fn load(&mut self, all: &IntRegisters) { + self.preserved.r15 = all.r15; + self.preserved.r14 = all.r14; + self.preserved.r13 = all.r13; + self.preserved.r12 = all.r12; + self.preserved.rbp = all.rbp; + self.preserved.rbx = all.rbx; + self.scratch.r11 = all.r11; + self.scratch.r10 = all.r10; + self.scratch.r9 = all.r9; + self.scratch.r8 = all.r8; + self.scratch.rsi = all.rsi; + self.scratch.rdi = all.rdi; + self.scratch.rdx = all.rdx; + self.scratch.rcx = all.rcx; + self.scratch.rax = all.rax; + self.iret.rip = all.rip; + self.iret.rsp = all.rsp; + + // CS and SS are immutable, at least their privilege levels. + + // OF, DF, 0, TF => D + // SF, ZF, 0, AF => D + // 0, PF, 1, CF => 5 + const ALLOWED_RFLAGS: usize = 0xDD5; + + self.iret.rflags &= !ALLOWED_RFLAGS; + self.iret.rflags |= all.rflags & ALLOWED_RFLAGS; + } + /// Enables the "Trap Flag" in the FLAGS register, causing the CPU + /// to send a Debug exception after the next instruction. This is + /// used for singlestep in the proc: scheme. + pub fn set_singlestep(&mut self, enabled: bool) { + if enabled { + self.iret.rflags |= FLAG_SINGLESTEP; + } else { + self.iret.rflags &= !FLAG_SINGLESTEP; + } + } +} + +#[macro_export] +macro_rules! push_scratch { + () => { + " + // Push scratch registers + push rcx + push rdx + push rdi + push rsi + push r8 + push r9 + push r10 + push r11 + " + }; +} +#[macro_export] +macro_rules! pop_scratch { + () => { + " + // Pop scratch registers + pop r11 + pop r10 + pop r9 + pop r8 + pop rsi + pop rdi + pop rdx + pop rcx + pop rax + " + }; +} + +#[macro_export] +macro_rules! push_preserved { + () => { + " + // Push preserved registers + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + " + }; +} +#[macro_export] +macro_rules! pop_preserved { + () => { + " + // Pop preserved registers + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + " + }; +} +macro_rules! swapgs_iff_ring3_fast { + // TODO: Spectre V1: LFENCE? 
+ () => { + " + // Check whether the last two bits RSP+8 (code segment) are equal to zero. + test QWORD PTR [rsp + 8], 0x3 + // Skip the SWAPGS instruction if CS & 0b11 == 0b00. + jz 2f + swapgs + 2: + " + }; +} +macro_rules! swapgs_iff_ring3_fast_errorcode { + // TODO: Spectre V1: LFENCE? + () => { + " + test QWORD PTR [rsp + 16], 0x3 + jz 2f + swapgs + 2: + " + }; +} + +macro_rules! conditional_swapgs_paranoid { + // For regular interrupt handlers and the syscall handler, managing IA32_GS_BASE and + // IA32_KERNEL_GS_BASE (the "GSBASE registers") is more or less trivial when using the SWAPGS + // instruction. + // + // The syscall handler simply runs SWAPGS, as syscalls can only originate from usermode, + // whereas interrupt handlers conditionally SWAPGS unless the interrupt was triggered from + // kernel mode, in which case the "swap state" is already valid, and there is no need to + // SWAPGS. + // + // Handling GSBASE correctly for paranoid interrupts however, is not as simple. NMIs can occur + // between the check of whether an interrupt came from usermode, and the actual SWAPGS + // instruction. #DB can also be triggered inside of a kernel interrupt handler, due to + // breakpoints, even though setting up such breakpoints in the first place, is not yet + // supported by the kernel. + // + // Luckily, the GDT always resides in the PCR (at least after init_paging, but there are no + // interrupt handlers set up before that), allowing GSBASE to be calculated relatively cheaply. + // Out of the two GSBASE registers, at least one must be *the* kernel GSBASE, allowing for a + // simple conditional SWAPGS. + // + // (An alternative to conditionally executing SWAPGS, would be to save and restore GSBASE via + // e.g. the stack. That would nonetheless require saving and restoring both GSBASE registers, + // if the interrupt handler should be allowed to context switch, which the current #DB handler + // may do.) 
+ // + // TODO: Handle nested NMIs like Linux does (https://lwn.net/Articles/484932/)?. + + () => { concat!( + // Put the GDT base pointer in RDI. + " + sub rsp, 16 + sgdt [rsp + 6] + mov rdi, [rsp + 8] + add rsp, 16 + ", + // Calculate the PCR address by subtracting the offset of the GDT in the PCR struct. + "sub rdi, {PCR_GDT_OFFSET};", + + // Read the current IA32_GS_BASE value into RDX. + alternative!( + feature: "fsgsbase", + then: ["rdgsbase rdx"], + default: [" + mov ecx, {IA32_GS_BASE} + rdmsr + shl rdx, 32 + or rdx, rax + "] + ), + + // If they were not equal, the PCR address must instead be in IA32_KERNEL_GS_BASE, + // requiring a SWAPGS. GSBASE needs to be swapped back, so store the same flag in RBX. + + // TODO: Spectre V1: LFENCE? + " + cmp rdx, rdi + sete bl + je 2f + swapgs + 2: + ", + ) } +} +macro_rules! conditional_swapgs_back_paranoid { + () => { + " + test bl, bl + jnz 2f + swapgs + 2: + " + }; +} +macro_rules! nop { + () => { + " + // Unused: {IA32_GS_BASE} {PCR_GDT_OFFSET} + " + }; +} + +#[macro_export] +macro_rules! interrupt_stack { + // XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to + // use idents directly instead. + ($name:ident, $save1:ident!, $save2:ident!, $rstor2:ident!, $rstor1:ident!, is_paranoid: $is_paranoid:expr_2021, |$stack:ident| $code:block) => { + #[unsafe(naked)] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner($stack: &mut $crate::arch::x86_64::interrupt::InterruptStack) { + $code + } + core::arch::naked_asm!( + // Clear direction flag, required by ABI when running any Rust code in the kernel. 
+ "cld;", + + // Backup all userspace registers to stack + $save1!(), + "push rax", + push_scratch!(), + push_preserved!(), + + $save2!(), + + // TODO: Map PTI + // $crate::arch::x86_64::pti::map(); + + // Call inner function with pointer to stack + " + mov rdi, rsp + call {inner} + ", + + // TODO: Unmap PTI + // $crate::arch::x86_64::pti::unmap(); + + $rstor2!(), + + // Restore all userspace registers + pop_preserved!(), + pop_scratch!(), + + $rstor1!(), + "iretq", + + inner = sym inner, + IA32_GS_BASE = const(x86::msr::IA32_GS_BASE), + + PCR_GDT_OFFSET = const(core::mem::offset_of!($crate::arch::gdt::ProcessorControlRegion, gdt)), + ); + } + }; + ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, swapgs_iff_ring3_fast!, nop!, nop!, swapgs_iff_ring3_fast!, is_paranoid: false, |$stack| $code); }; + ($name:ident, @paranoid, |$stack:ident| $code:block) => { interrupt_stack!($name, nop!, conditional_swapgs_paranoid!, conditional_swapgs_back_paranoid!, nop!, is_paranoid: true, |$stack| $code); } +} + +#[macro_export] +macro_rules! interrupt { + ($name:ident, || $code:block) => { + #[unsafe(naked)] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner() { + $code + } + + core::arch::naked_asm!( + // Clear direction flag, required by ABI when running any Rust code in the kernel. + "cld;", + + // Backup all userspace registers to stack + swapgs_iff_ring3_fast!(), + "push rax", + push_scratch!(), + + // TODO: Map PTI + // $crate::arch::x86_64::pti::map(); + + // Call inner function with pointer to stack + "call {inner}", + + // TODO: Unmap PTI + // $crate::arch::x86_64::pti::unmap(); + + // Restore all userspace registers + pop_scratch!(), + + swapgs_iff_ring3_fast!(), + "iretq", + + inner = sym inner, + ); + } + }; +} + +#[macro_export] +macro_rules! 
interrupt_error { + ($name:ident, |$stack:ident, $error_code:ident| $code:block) => { + #[unsafe(naked)] + pub unsafe extern "C" fn $name() { + unsafe extern "C" fn inner($stack: &mut $crate::arch::x86_64::interrupt::handler::InterruptStack, $error_code: usize) { + $code + } + + core::arch::naked_asm!( + // Clear direction flag, required by ABI when running any Rust code in the kernel. + "cld;", + + swapgs_iff_ring3_fast_errorcode!(), + + // Don't push RAX yet, as the error code is already stored in RAX's position. + + // Push all userspace registers + push_scratch!(), + push_preserved!(), + + // Now that we have a couple of usable registers, put the error code in the second + // argument register for the inner function, and save RAX where it would normally + // be. + "mov rsi, [rsp + {rax_offset}];", + "mov [rsp + {rax_offset}], rax;", + + // TODO: Map PTI + // $crate::arch::x86_64::pti::map(); + + // Call inner function with pointer to stack, and error code. + "mov rdi, rsp;", + "call {inner};", + + // TODO: Unmap PTI + // $crate::arch::x86_64::pti::unmap(); + + // Restore all userspace registers + pop_preserved!(), + pop_scratch!(), + + // The error code has already been popped, so use the regular macro. + swapgs_iff_ring3_fast!(), + "iretq;", + + inner = sym inner, + rax_offset = const(size_of::<$crate::arch::interrupt::handler::PreservedRegisters>() + size_of::<$crate::arch::interrupt::handler::ScratchRegisters>() - 8), + ); + } + }; +} + +impl ArchIntCtx for InterruptStack { + fn ip(&self) -> usize { + self.iret.rip + } + fn recover_and_efault(&mut self) { + // We were inside a usercopy function that failed. This is handled by setting rax to a + // nonzero value, and emulating the ret instruction. 
+ self.scratch.rax = 1; + let ret_addr = unsafe { (self.iret.rsp as *const usize).read() }; + self.iret.rsp += 8; + self.iret.rip = ret_addr; + self.iret.rflags &= !(1 << 18); + } +} diff --git a/src/arch/x86_64/interrupt/mod.rs b/src/arch/x86_64/interrupt/mod.rs new file mode 100644 index 0000000000..efc4999262 --- /dev/null +++ b/src/arch/x86_64/interrupt/mod.rs @@ -0,0 +1,10 @@ +//! Interrupt instructions + +pub use crate::arch::x86_shared::interrupt::*; + +#[macro_use] +pub mod handler; + +pub mod syscall; + +pub use self::handler::InterruptStack; diff --git a/src/arch/x86_64/interrupt/syscall.rs b/src/arch/x86_64/interrupt/syscall.rs new file mode 100644 index 0000000000..d70e13534f --- /dev/null +++ b/src/arch/x86_64/interrupt/syscall.rs @@ -0,0 +1,195 @@ +use crate::{ + arch::{gdt, interrupt::InterruptStack}, + ptrace, + sync::CleanLockToken, + syscall, + syscall::flag::{PTRACE_FLAG_IGNORE, PTRACE_STOP_POST_SYSCALL, PTRACE_STOP_PRE_SYSCALL}, +}; +use core::mem::offset_of; +use x86::{ + bits64::{rflags::RFlags, task::TaskStateSegment}, + msr, + segmentation::SegmentSelector, +}; + +pub unsafe fn init() { + unsafe { + // IA32_STAR[31:0] are reserved. + + // The base selector of the two consecutive segments for kernel code and the immediately + // succeeding stack (data). + let syscall_cs_ss_base = (gdt::GDT_KERNEL_CODE as u16) << 3; + // The base selector of the three consecutive segments (of which two are used) for user code + // and user data. It points to a 32-bit code segment, which must be followed by a data segment + // (stack), and a 64-bit code segment. + let sysret_cs_ss_base = ((gdt::GDT_USER_CODE32_UNUSED as u16) << 3) | 3; + let star_high = u32::from(syscall_cs_ss_base) | (u32::from(sysret_cs_ss_base) << 16); + + msr::wrmsr(msr::IA32_STAR, u64::from(star_high) << 32); + #[expect(clippy::fn_to_numeric_cast)] + msr::wrmsr(msr::IA32_LSTAR, syscall_instruction as u64); + + // DF needs to be cleared, required by the compiler ABI. 
If DF were not part of FMASK, + // userspace would be able to reverse the direction of in-kernel REP MOVS/STOS/(CMPS/SCAS), and + // cause all sorts of memory corruption. + // + // IF needs to be cleared, as the kernel currently assumes interrupts are disabled except in + // usermode and in kmain. + // + // TF needs to be cleared, as enabling userspace-rflags-controlled singlestep in the kernel + // would be a bad idea. + // + // AC it should always be cleared when entering the kernel (and never be set except in usercopy + // functions), if for some reason AC was set before entering userspace (AC can only be modified + // by kernel code). + // + // The other flags could indeed be preserved and excluded from FMASK, but since they are not + // used to pass data to the kernel, they might as well be masked with *marginal* security + // benefits. + // + // Flags not included here are IOPL (not relevant to the kernel at all), "CPUID flag" (not used + // at all in 64-bit mode), RF (not used yet, but DR breakpoints would remain enabled both in + // user and kernel mode), VM8086 (not used at all), and VIF/VIP (system-level status flags?). 
+ + let mask_critical = + RFlags::FLAGS_DF | RFlags::FLAGS_IF | RFlags::FLAGS_TF | RFlags::FLAGS_AC; + let mask_other = RFlags::FLAGS_CF + | RFlags::FLAGS_PF + | RFlags::FLAGS_AF + | RFlags::FLAGS_ZF + | RFlags::FLAGS_SF + | RFlags::FLAGS_OF; + msr::wrmsr(msr::IA32_FMASK, (mask_critical | mask_other).bits()); + + let efer = msr::rdmsr(msr::IA32_EFER); + msr::wrmsr(msr::IA32_EFER, efer | 1); + } +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn __inner_syscall_instruction(stack: *mut InterruptStack) { + unsafe { + let mut token = CleanLockToken::new(); + let allowed = ptrace::breakpoint_callback(PTRACE_STOP_PRE_SYSCALL, None, &mut token) + .and_then(|_| ptrace::next_breakpoint().map(|f| !f.contains(PTRACE_FLAG_IGNORE))); + + if allowed.unwrap_or(true) { + let scratch = &(*stack).scratch; + + let ret = syscall::syscall( + scratch.rax, + scratch.rdi, + scratch.rsi, + scratch.rdx, + scratch.r10, + scratch.r8, + scratch.r9, + &mut token, + ); + (*stack).scratch.rax = ret; + } + + ptrace::breakpoint_callback(PTRACE_STOP_POST_SYSCALL, None, &mut token); + } +} + +#[unsafe(naked)] +pub unsafe extern "C" fn syscall_instruction() { + core::arch::naked_asm!( + // Yes, this is magic. No, you don't need to understand + "swapgs;", // Swap KGSBASE with GSBASE, allowing fast TSS access. 
+ "mov gs:[{sp}], rsp;", // Save userspace stack pointer + "mov rsp, gs:[{ksp}];", // Load kernel stack pointer + "push QWORD PTR {ss_sel};", // Push fake userspace SS (resembling iret frame) + "push QWORD PTR gs:[{sp}];", // Push userspace rsp + "push r11;", // Push rflags + "push QWORD PTR {cs_sel};", // Push fake CS (resembling iret stack frame) + "push rcx;", // Push userspace return pointer + + // Push context registers + "push rax;", + push_scratch!(), + push_preserved!(), + + // TODO: Map PTI + // $crate::arch::x86_64::pti::map(); + + // Call inner function with pointer to stack + "mov rdi, rsp;", + "call __inner_syscall_instruction;", + + // TODO: Unmap PTI + // $crate::arch::x86_64::pti::unmap(); + + " + .globl enter_usermode + enter_usermode: + ", + + // Pop context registers + pop_preserved!(), + pop_scratch!(), + + // Restore user GSBASE by swapping GSBASE and KGSBASE. + "swapgs;", + + // TODO: Should we unconditionally jump or avoid jumping, to hint to the branch predictor that + // singlestep is NOT set? + // + // It appears Intel CPUs assume (previously unknown) forward conditional branches to not be + // taken, and AMD appears to assume all previously unknown conditional branches will not be + // taken. + + // Check if the Trap Flag (singlestep flag) is set. If so, sysretq will return to before the + // instruction, whereas debuggers expect the iretq behavior of returning to after the + // instruction. RFLAGS sits at [rsp + 16] here (RIP at +0, CS at +8), so bit 0 of byte 17 is TF (bit 8). + + "test BYTE PTR [rsp + 17], 1;", + // If set, return using IRETQ instead. + "jnz 2f;", + + // Otherwise, continue with the fast sysretq. + + // Pop userspace return pointer + "pop rcx;", + + // We must ensure RCX is canonical; if it is not when running sysretq, the consequences can be + // fatal from a security perspective. + // + // See https://xenproject.org/2012/06/13/the-intel-sysret-privilege-escalation/. + // + // This is not just theoretical; ptrace allows userspace to change RCX (via RIP) of target + // processes. 
+ // + // While we could also conditionally IRETQ here, an easier method is to simply sign-extend RCX: + + // Shift away the upper 16 bits (0xBAAD_8000_DEAD_BEEF => 0x8000_DEAD_BEEF_XXXX). + "shl rcx, 16;", + // Shift arithmetically right by 16 bits, effectively extending the 47th sign bit to bits + // 63:48 (0x8000_DEAD_BEEF_XXXX => 0xFFFF_8000_DEAD_BEEF). + "sar rcx, 16;", + + "add rsp, 8;", // Pop fake userspace CS + "pop r11;", // Pop rflags + "pop rsp;", // Restore userspace stack pointer + "sysretq;", // Return into userspace; RCX=>RIP,R11=>RFLAGS + + // IRETQ fallback: + " + .p2align 4 +2: + xor rcx, rcx + xor r11, r11 + iretq + ", + + sp = const(offset_of!(gdt::ProcessorControlRegion, user_rsp_tmp)), + ksp = const(offset_of!(gdt::ProcessorControlRegion, tss) + offset_of!(TaskStateSegment, rsp)), + ss_sel = const(SegmentSelector::new(gdt::GDT_USER_DATA as u16, x86::Ring::Ring3).bits()), + cs_sel = const(SegmentSelector::new(gdt::GDT_USER_CODE as u16, x86::Ring::Ring3).bits()), + ); +} +unsafe extern "C" { + // TODO: macro? + pub fn enter_usermode(); +} diff --git a/src/arch/x86_64/macros.rs b/src/arch/x86_64/macros.rs new file mode 100644 index 0000000000..8d7e7b2e08 --- /dev/null +++ b/src/arch/x86_64/macros.rs @@ -0,0 +1,80 @@ +macro_rules! expand_bool( + ($value:expr_2021) => { + concat!($value) + } +); + +macro_rules! alternative( + (feature: $feature:literal, then: [$($then:expr_2021),*], default: [$($default:expr_2021),*]) => { + alternative2!(feature1: $feature, then1: [$($then),*], feature2: "", then2: [""], default: [$($default),*]) + } +); +macro_rules! saturating_sub( + ($lhs:literal, $rhs:literal) => { concat!( + "((", $lhs, ")>(", $rhs, "))*((", $lhs, ")-(", $rhs, "))", + ) } +); +// Use feature1 if present, otherwise try using feature2, otherwise use default. +// +// cpu_feature_always simply means it is always enabled. 
Thus, if feature2, which has lower +// priority, is "always" but feature1 is "auto", feature2 will still be checked for, and feature2 +// will become the fallback code. +// +// An empty string as feature is equivalent with "never". +macro_rules! alternative2( + (feature1: $feature1:literal, then1: [$($then1:expr_2021),*], feature2: $feature2:literal, then2: [$($then2:expr_2021),*], default: [$($default:expr_2021),*]) => { + concat!(" + .set true, 1 + .set false, 0 + 40: + .if ", expand_bool!(cfg!(cpu_feature_always = $feature1)), " + ", $($then1,)* " + .elseif ", expand_bool!(cfg!(cpu_feature_always = $feature2)), " + ", $($then2,)* " + .else + ", $($default,)* " + .endif + 42: + .if ", expand_bool!(cfg!(cpu_feature_auto = $feature1)), " + .skip -", saturating_sub!("51f - 50f", "42b - 40b"), ", 0x90 + .endif + .if ", expand_bool!(cfg!(cpu_feature_auto = $feature2)), " + .skip -", saturating_sub!("61f - 60f", "42b - 40b"), ", 0x90 + .endif + 41: + ", + // FIXME: The assembler apparently complains "invalid number of bytes" despite it being + // quite obvious what saturating_sub does. + + // Declare them in reverse order. Last relocation wins! + alternative_auto!("6", $feature2, [$($then2),*]), + alternative_auto!("5", $feature1, [$($then1),*]), + ) + }; +); +macro_rules! 
alternative_auto( + ($first_digit:literal, $feature:literal, [$($then:expr_2021),*]) => { concat!( + ".if ", expand_bool!(cfg!(cpu_feature_auto = $feature)), " + .pushsection .altcode.", $feature, ",\"a\" + ", $first_digit, "0: + ", $($then,)* " + ", $first_digit, "1: + .popsection + .pushsection .altfeatures.", $feature, ",\"a\" + 70: .ascii \"", $feature, "\" + 71: + .popsection + .pushsection .altrelocs.", $feature, ",\"a\" + .quad 70b + .quad 71b - 70b + .quad 40b + .quad 42b - 40b + .quad 41b - 40b + .quad 0 + .quad ", $first_digit, "0b + .quad ", $first_digit, "1b - ", $first_digit, "0b + .popsection + .endif + ", + ) } +); diff --git a/src/arch/x86_64/misc.rs b/src/arch/x86_64/misc.rs new file mode 100644 index 0000000000..91a9b71c73 --- /dev/null +++ b/src/arch/x86_64/misc.rs @@ -0,0 +1,29 @@ +use x86::controlregs::Cr4; + +use crate::{ + arch::cpuid::{cpuid, has_ext_feat}, + cpu_set::LogicalCpuId, +}; + +pub unsafe fn init(cpu_id: LogicalCpuId) { + unsafe { + if has_ext_feat(|feat| feat.has_umip()) { + // UMIP (UserMode Instruction Prevention) forbids userspace from calling SGDT, SIDT, SLDT, + // SMSW and STR. KASLR is currently not implemented, but this protects against leaking + // addresses. + x86::controlregs::cr4_write(x86::controlregs::cr4() | Cr4::CR4_ENABLE_UMIP); + } + if has_ext_feat(|feat| feat.has_smep()) { + // SMEP (Supervisor-Mode Execution Prevention) forbids the kernel from executing + // instruction on any page marked "userspace-accessible". This improves security for + // obvious reasons. 
+ x86::controlregs::cr4_write(x86::controlregs::cr4() | Cr4::CR4_ENABLE_SMEP); + } + + if let Some(feats) = cpuid().get_extended_processor_and_feature_identifiers() + && feats.has_rdtscp() + { + x86::msr::wrmsr(x86::msr::IA32_TSC_AUX, cpu_id.get().into()); + } + } +} diff --git a/src/arch/x86_64/mod.rs b/src/arch/x86_64/mod.rs new file mode 100644 index 0000000000..513970014a --- /dev/null +++ b/src/arch/x86_64/mod.rs @@ -0,0 +1,50 @@ +pub use crate::arch::x86_shared::*; + +pub mod alternative; + +#[macro_use] +pub mod macros; + +/// Constants like memory locations +pub mod consts; + +/// Interrupt instructions +#[macro_use] +pub mod interrupt; + +/// Miscellaneous processor features +pub mod misc; + +// TODO: Maybe support rewriting relocations (using LD's --emit-relocs) when working with entire +// functions? +#[unsafe(naked)] +pub unsafe extern "C" fn arch_copy_to_user(dst: usize, src: usize, len: usize) -> u8 { + // TODO: spectre_v1 + + core::arch::naked_asm!( + ".global __usercopy_start + __usercopy_start:", + alternative!( + feature: "smap", + then: [" + xor eax, eax + mov rcx, rdx + stac + rep movsb + clac + ret + "], + default: [" + xor eax, eax + mov rcx, rdx + rep movsb + ret + "] + ), + ".global __usercopy_end + __usercopy_end:" + ); +} +pub use arch_copy_to_user as arch_copy_from_user; + +pub use alternative::kfx_size; diff --git a/src/arch/x86_shared/cpuid.rs b/src/arch/x86_shared/cpuid.rs new file mode 100644 index 0000000000..b36831252c --- /dev/null +++ b/src/arch/x86_shared/cpuid.rs @@ -0,0 +1,29 @@ +use raw_cpuid::{CpuId, CpuIdResult, ExtendedFeatures, FeatureInfo}; + +pub fn cpuid() -> CpuId { + // FIXME check for cpuid availability during early boot and error out if it doesn't exist. 
+ CpuId::with_cpuid_fn(|a, c| { + #[cfg(target_arch = "x86")] + let result = unsafe { core::arch::x86::__cpuid_count(a, c) }; + #[cfg(target_arch = "x86_64")] + let result = unsafe { core::arch::x86_64::__cpuid_count(a, c) }; + CpuIdResult { + eax: result.eax, + ebx: result.ebx, + ecx: result.ecx, + edx: result.edx, + } + }) +} + +#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))] +pub fn feature_info() -> FeatureInfo { + cpuid() + .get_feature_info() + .expect("x86_64 requires CPUID leaf=0x01 to be present") +} + +#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))] +pub fn has_ext_feat(feat: impl FnOnce(ExtendedFeatures) -> bool) -> bool { + cpuid().get_extended_feature_info().is_some_and(feat) +} diff --git a/src/arch/x86_shared/debug.rs b/src/arch/x86_shared/debug.rs new file mode 100644 index 0000000000..ab87b0f758 --- /dev/null +++ b/src/arch/x86_shared/debug.rs @@ -0,0 +1,62 @@ +#[cfg(feature = "qemu_debug")] +use spin::Mutex; +use spin::MutexGuard; + +use crate::devices::serial::SerialKind; +#[cfg(feature = "lpss_debug")] +use crate::devices::uart_16550::SerialPort; +#[cfg(feature = "lpss_debug")] +use crate::syscall::io::Mmio; +#[cfg(feature = "qemu_debug")] +use crate::syscall::io::Pio; +#[cfg(feature = "qemu_debug")] +use syscall::io::Io; + +use super::device::serial::{COM1, LPSS}; +#[cfg(feature = "system76_ec_debug")] +use super::device::system76_ec::{System76Ec, SYSTEM76_EC}; + +#[cfg(feature = "qemu_debug")] +pub static QEMU: Mutex> = Mutex::new(Pio::::new(0x402)); + +pub struct Writer<'a> { + lpss: MutexGuard<'a, SerialKind>, + #[cfg(feature = "qemu_debug")] + qemu: MutexGuard<'a, Pio>, + serial: MutexGuard<'a, SerialKind>, + #[cfg(feature = "system76_ec_debug")] + system76_ec: MutexGuard<'a, Option>, +} + +impl<'a> Writer<'a> { + pub fn new() -> Writer<'a> { + Writer { + lpss: LPSS.lock(), + #[cfg(feature = "qemu_debug")] + qemu: QEMU.lock(), + serial: COM1.lock(), + #[cfg(feature = "system76_ec_debug")] + system76_ec: 
SYSTEM76_EC.lock(), + } + } + + pub fn write(&mut self, buf: &[u8]) { + self.lpss.write(buf); + + #[cfg(feature = "qemu_debug")] + { + for &b in buf { + self.qemu.write(b); + } + } + + self.serial.write(buf); + + #[cfg(feature = "system76_ec_debug")] + { + if let Some(ref mut system76_ec) = *self.system76_ec { + system76_ec.print_slice(buf); + } + } + } +} diff --git a/src/arch/x86_shared/device/cpu.rs b/src/arch/x86_shared/device/cpu.rs new file mode 100644 index 0000000000..c76d1a6048 --- /dev/null +++ b/src/arch/x86_shared/device/cpu.rs @@ -0,0 +1,281 @@ +use core::fmt::{Result, Write}; + +use crate::arch::cpuid::cpuid; + +pub fn cpu_info(w: &mut W) -> Result { + let cpuid = cpuid(); + + if let Some(info) = cpuid.get_vendor_info() { + writeln!(w, "Vendor: {}", info.as_str())?; + } + + if let Some(brand) = cpuid.get_processor_brand_string() { + writeln!(w, "Model: {}", brand.as_str())?; + } + + if let Some(info) = cpuid.get_processor_frequency_info() { + writeln!(w, "CPU Base MHz: {}", info.processor_base_frequency())?; + writeln!(w, "CPU Max MHz: {}", info.processor_max_frequency())?; + writeln!(w, "Bus MHz: {}", info.bus_frequency())?; + } + + write!(w, "Features:")?; + + if let Some(info) = cpuid.get_feature_info() { + if info.has_fpu() { + write!(w, " fpu")? + }; + if info.has_vme() { + write!(w, " vme")? + }; + if info.has_de() { + write!(w, " de")? + }; + if info.has_pse() { + write!(w, " pse")? + }; + if info.has_tsc() { + write!(w, " tsc")? + }; + if info.has_msr() { + write!(w, " msr")? + }; + if info.has_pae() { + write!(w, " pae")? + }; + if info.has_mce() { + write!(w, " mce")? + }; + + if info.has_cmpxchg8b() { + write!(w, " cx8")? + }; + if info.has_apic() { + write!(w, " apic")? + }; + if info.has_sysenter_sysexit() { + write!(w, " sep")? + }; + if info.has_mtrr() { + write!(w, " mtrr")? + }; + if info.has_pge() { + write!(w, " pge")? + }; + if info.has_mca() { + write!(w, " mca")? + }; + if info.has_cmov() { + write!(w, " cmov")? 
+ }; + if info.has_pat() { + write!(w, " pat")? + }; + + if info.has_pse36() { + write!(w, " pse36")? + }; + if info.has_psn() { + write!(w, " psn")? + }; + if info.has_clflush() { + write!(w, " clflush")? + }; + if info.has_ds() { + write!(w, " ds")? + }; + if info.has_acpi() { + write!(w, " acpi")? + }; + if info.has_mmx() { + write!(w, " mmx")? + }; + if info.has_fxsave_fxstor() { + write!(w, " fxsr")? + }; + if info.has_sse() { + write!(w, " sse")? + }; + + if info.has_sse2() { + write!(w, " sse2")? + }; + if info.has_ss() { + write!(w, " ss")? + }; + if info.has_htt() { + write!(w, " ht")? + }; + if info.has_tm() { + write!(w, " tm")? + }; + if info.has_pbe() { + write!(w, " pbe")? + }; + + if info.has_sse3() { + write!(w, " sse3")? + }; + if info.has_pclmulqdq() { + write!(w, " pclmulqdq")? + }; + if info.has_ds_area() { + write!(w, " dtes64")? + }; + if info.has_monitor_mwait() { + write!(w, " monitor")? + }; + if info.has_cpl() { + write!(w, " ds_cpl")? + }; + if info.has_vmx() { + write!(w, " vmx")? + }; + if info.has_smx() { + write!(w, " smx")? + }; + if info.has_eist() { + write!(w, " est")? + }; + + if info.has_tm2() { + write!(w, " tm2")? + }; + if info.has_ssse3() { + write!(w, " ssse3")? + }; + if info.has_cnxtid() { + write!(w, " cnxtid")? + }; + if info.has_fma() { + write!(w, " fma")? + }; + if info.has_cmpxchg16b() { + write!(w, " cx16")? + }; + if info.has_pdcm() { + write!(w, " pdcm")? + }; + if info.has_pcid() { + write!(w, " pcid")? + }; + if info.has_dca() { + write!(w, " dca")? + }; + + if info.has_sse41() { + write!(w, " sse4_1")? + }; + if info.has_sse42() { + write!(w, " sse4_2")? + }; + if info.has_x2apic() { + write!(w, " x2apic")? + }; + if info.has_movbe() { + write!(w, " movbe")? + }; + if info.has_popcnt() { + write!(w, " popcnt")? + }; + if info.has_tsc_deadline() { + write!(w, " tsc_deadline_timer")? + }; + if info.has_aesni() { + write!(w, " aes")? + }; + if info.has_xsave() { + write!(w, " xsave")? 
+ }; + + if info.has_oxsave() { // CPUID.1:ECX.OSXSAVE (the OS has enabled XSAVE); XSAVEOPT is a different feature bit, so do not label this "xsaveopt" + write!(w, " osxsave")? + }; + if info.has_avx() { + write!(w, " avx")? + }; + if info.has_f16c() { + write!(w, " f16c")? + }; + if info.has_rdrand() { + write!(w, " rdrand")? + }; + } + + if let Some(info) = cpuid.get_extended_processor_and_feature_identifiers() { + if info.has_64bit_mode() { + write!(w, " lm")? + }; + if info.has_rdtscp() { + write!(w, " rdtscp")? + }; + if info.has_1gib_pages() { + write!(w, " pdpe1gb")? + }; + if info.has_execute_disable() { + write!(w, " nx")? + }; + if info.has_syscall_sysret() { + write!(w, " syscall")? + }; + if info.has_prefetchw() { + write!(w, " prefetchw")? + }; + if info.has_lzcnt() { + write!(w, " lzcnt")? + }; + if info.has_lahf_sahf() { + write!(w, " lahf_lm")? + }; + } + + if let Some(info) = cpuid.get_advanced_power_mgmt_info() + && info.has_invariant_tsc() + { + write!(w, " constant_tsc")? + }; + + if let Some(info) = cpuid.get_extended_feature_info() { + if info.has_fsgsbase() { + write!(w, " fsgsbase")? + }; + if info.has_tsc_adjust_msr() { + write!(w, " tsc_adjust")? + }; + if info.has_bmi1() { + write!(w, " bmi1")? + }; + if info.has_hle() { + write!(w, " hle")? + }; + if info.has_avx2() { + write!(w, " avx2")? + }; + if info.has_smep() { + write!(w, " smep")? + }; + if info.has_bmi2() { + write!(w, " bmi2")? + }; + if info.has_rep_movsb_stosb() { + write!(w, " erms")? + }; + if info.has_invpcid() { + write!(w, " invpcid")? + }; + if info.has_rtm() { + write!(w, " rtm")? + }; + //if info.has_qm() { write!(w, " qm")? }; + if info.has_fpu_cs_ds_deprecated() { + write!(w, " fpu_seg")? + }; + if info.has_mpx() { + write!(w, " mpx")? + }; + } + + writeln!(w)?; + + Ok(()) +} diff --git a/src/arch/x86_shared/device/hpet.rs b/src/arch/x86_shared/device/hpet.rs new file mode 100644 index 0000000000..62a2f69fb7 --- /dev/null +++ b/src/arch/x86_shared/device/hpet.rs @@ -0,0 +1,125 @@ +//! 
+ +use super::pit; +use crate::acpi::hpet::Hpet; +use core::time::Duration; + +const LEG_RT_CNF: u64 = 2; +const ENABLE_CNF: u64 = 1; + +const TN_VAL_SET_CNF: u64 = 0x40; +const TN_TYPE_CNF: u64 = 0x08; +const TN_INT_ENB_CNF: u64 = 0x04; + +pub(crate) const CAPABILITY_OFFSET: usize = 0x00; +const GENERAL_CONFIG_OFFSET: usize = 0x10; +const GENERAL_INTERRUPT_OFFSET: usize = 0x20; +pub(crate) const MAIN_COUNTER_OFFSET: usize = 0xF0; +// const NUM_TIMER_CAP_MASK: u64 = 0x0f00; +const LEG_RT_CAP: u64 = 0x8000; +const T0_CONFIG_CAPABILITY_OFFSET: usize = 0x100; +pub(crate) const T0_COMPARATOR_OFFSET: usize = 0x108; + +const PER_INT_CAP: u64 = 0x10; + +pub unsafe fn init(hpet: &mut Hpet) -> bool { + unsafe { + debug!("HPET @ {:#x}", { hpet.base_address.address }); + debug_caps(hpet); + + trace!("HPET Before Init"); + debug_config(hpet); + + // Disable HPET + { + let mut config_word = hpet.read_u64(GENERAL_CONFIG_OFFSET); + config_word &= !(LEG_RT_CNF | ENABLE_CNF); + hpet.write_u64(GENERAL_CONFIG_OFFSET, config_word); + } + + let capability = hpet.read_u64(CAPABILITY_OFFSET); + if capability & LEG_RT_CAP == 0 { + warn!("HPET missing capability LEG_RT_CAP"); + return false; + } + + let period_fs = capability >> 32; // femtoseconds per counter tick; used as a divisor, so a zero value is rejected like a missing capability + let Some(divisor) = (pit::RATE as u64 * 1_000_000).checked_div(period_fs) else { warn!("HPET reports a zero counter period"); return false; }; + + let t0_capabilities = hpet.read_u64(T0_CONFIG_CAPABILITY_OFFSET); + if t0_capabilities & PER_INT_CAP == 0 { + warn!("HPET T0 missing capability PER_INT_CAP"); + return false; + } + + let counter = hpet.read_u64(MAIN_COUNTER_OFFSET); + + let t0_config_word: u64 = TN_VAL_SET_CNF | TN_TYPE_CNF | TN_INT_ENB_CNF; + hpet.write_u64(T0_CONFIG_CAPABILITY_OFFSET, t0_config_word); + // set accumulator value + hpet.write_u64(T0_COMPARATOR_OFFSET, counter + divisor); + // set interval + hpet.write_u64(T0_COMPARATOR_OFFSET, divisor); + + // Enable interrupts from the HPET + { + let mut config_word: u64 = hpet.read_u64(GENERAL_CONFIG_OFFSET); + config_word |= LEG_RT_CNF | ENABLE_CNF; + 
hpet.write_u64(GENERAL_CONFIG_OFFSET, config_word);
+        }
+
+        trace!("HPET After Init");
+        debug_config(hpet);
+
+        true
+    }
+}
+
+unsafe fn debug_caps(hpet: &mut Hpet) {
+    unsafe {
+        let capability = hpet.read_u64(CAPABILITY_OFFSET);
+        trace!(" caps: {:#x}", capability);
+        trace!(
+            " clock period: {:?}",
+            Duration::from_nanos((capability >> 32) / 1_000_000)
+        );
+        trace!(
+            " ID: {:#x} revision: {}",
+            (capability >> 16) as u16,
+            capability as u8
+        );
+        trace!(
+            " LEG_RT_CAP: {} COUNT_SIZE_CAP: {}",
+            capability & (1 << 15) == (1 << 15),
+            capability & (1 << 13) == (1 << 13)
+        );
+        // The NUM_TIM_CAP field contains the index of the last timer.
+        // Add 1 to get the amount of timers.
+        trace!(" timers: {}", ((capability >> 8) as u8 & 0x1F) + 1);
+
+        let t0_capabilities = hpet.read_u64(T0_CONFIG_CAPABILITY_OFFSET);
+        trace!(
+            " T0 interrupt routing: {:#x}",
+            (t0_capabilities >> 32) as u32
+        );
+    }
+}
+
+unsafe fn debug_config(hpet: &mut Hpet) {
+    unsafe {
+        let config_word = hpet.read_u64(GENERAL_CONFIG_OFFSET);
+        trace!(" config: {:#x}", config_word);
+
+        let interrupt_status = hpet.read_u64(GENERAL_INTERRUPT_OFFSET);
+        trace!(" interrupt status: {:#x}", interrupt_status);
+
+        let counter = hpet.read_u64(MAIN_COUNTER_OFFSET);
+        trace!(" counter: {:#x}", counter);
+
+        let t0_capabilities = hpet.read_u64(T0_CONFIG_CAPABILITY_OFFSET);
+        trace!(" T0 flags: {:#x}", t0_capabilities as u32);
+
+        let t0_comparator = hpet.read_u64(T0_COMPARATOR_OFFSET);
+        trace!(" T0 comparator: {:#x}", t0_comparator);
+    }
+}
diff --git a/src/arch/x86_shared/device/ioapic.rs b/src/arch/x86_shared/device/ioapic.rs
new file mode 100644
index 0000000000..fb66d3bf2b
--- /dev/null
+++ b/src/arch/x86_shared/device/ioapic.rs
@@ -0,0 +1,446 @@
+use core::{cell::SyncUnsafeCell, fmt, ptr};
+
+use alloc::vec::Vec;
+use spin::Mutex;
+
+use super::{local_apic::ApicId, pic};
+use crate::{
+    acpi::madt::{self, Madt, MadtEntry, MadtIntSrcOverride, MadtIoApic},
+    arch::{cpuid::cpuid, interrupt::irq},
+    memory::{map_device_memory, PhysicalAddress},
+};
+
+/// Register window of a single I/O APIC: a register-select word at `pointer` and a data word
+/// 0x10 bytes after it, through which all other registers are accessed indirectly.
+pub struct IoApicRegs {
+    pointer: *const u32,
+}
+impl IoApicRegs {
+    fn ioregsel(&self) -> *const u32 {
+        self.pointer
+    }
+    fn iowin(&self) -> *const u32 {
+        // offset 0x10
+        unsafe { self.pointer.offset(4) }
+    }
+    fn write_ioregsel(&mut self, value: u32) {
+        unsafe { ptr::write_volatile::<u32>(self.ioregsel().cast_mut(), value) }
+    }
+    fn read_iowin(&self) -> u32 {
+        unsafe { ptr::read_volatile::<u32>(self.iowin()) }
+    }
+    fn write_iowin(&mut self, value: u32) {
+        unsafe { ptr::write_volatile::<u32>(self.iowin().cast_mut(), value) }
+    }
+    fn read_reg(&mut self, reg: u8) -> u32 {
+        self.write_ioregsel(reg.into());
+        self.read_iowin()
+    }
+    fn write_reg(&mut self, reg: u8, value: u32) {
+        self.write_ioregsel(reg.into());
+        self.write_iowin(value);
+    }
+    pub fn read_ioapicid(&mut self) -> u32 {
+        self.read_reg(0x00)
+    }
+    pub fn read_ioapicver(&mut self) -> u32 {
+        self.read_reg(0x01)
+    }
+    pub fn read_ioredtbl(&mut self, idx: u8) -> u64 {
+        assert!(idx < 24);
+        let lo = self.read_reg(0x10 + idx * 2);
+        let hi = self.read_reg(0x10 + idx * 2 + 1);
+
+        u64::from(lo) | (u64::from(hi) << 32)
+    }
+    pub fn write_ioredtbl(&mut self, idx: u8, value: u64) {
+        assert!(idx < 24);
+
+        let lo = value as u32;
+        let hi = (value >> 32) as u32;
+
+        self.write_reg(0x10 + idx * 2, lo);
+        self.write_reg(0x10 + idx * 2 + 1, hi);
+    }
+
+    /// Returns the "maximum redirection entry" field (bits 16..24) of the version register.
+    ///
+    /// Per the I/O APIC datasheet this is the index of the highest redirection table entry,
+    /// i.e. one less than the number of entries.
+    pub fn max_redirection_table_entries(&mut self) -> u8 {
+        let ver = self.read_ioapicver();
+        ((ver & 0x00FF_0000) >> 16) as u8
+    }
+    #[allow(dead_code)]
+    pub fn id(&mut self) -> u8 {
+        let id_reg = self.read_ioapicid();
+        ((id_reg & 0x0F00_0000) >> 24) as u8
+    }
+}
+/// One I/O APIC, serving the `count` GSIs starting at `gsi_start`.
+pub struct IoApic {
+    regs: Mutex<IoApicRegs>,
+    gsi_start: u32,
+    count: u8,
+}
+unsafe impl Send for IoApic {}
+unsafe impl Sync for IoApic {}
+impl IoApic {
+    /// Wraps the registers mapped at `regs_base` and reads the entry count from the hardware.
+    #[allow(dead_code)]
+    pub fn new(regs_base: *const u32, gsi_start: u32) -> Self {
+        let mut regs = IoApicRegs { pointer: regs_base };
+        // The hardware reports the highest entry index, so the entry count is one more than
+        // that; otherwise `find_ioapic` would never match this I/O APIC's last GSI.
+        let count = regs.max_redirection_table_entries().saturating_add(1);
+
+        Self {
+            regs: Mutex::new(regs),
+            gsi_start,
+            count,
+        }
+    }
+    /// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode).
+    #[allow(dead_code)]
+    pub fn map(&self, idx: u8, info: MapInfo) {
+        self.regs.lock().write_ioredtbl(idx, info.as_raw())
+    }
+    /// Sets (`mask == true`) or clears the mask bit (bit 16) of the redirection entry for `gsi`.
+    pub fn set_mask(&self, gsi: u32, mask: bool) {
+        let idx = (gsi - self.gsi_start) as u8;
+        let mut guard = self.regs.lock();
+
+        let mut reg = guard.read_ioredtbl(idx);
+        reg &= !(1 << 16);
+        reg |= u64::from(mask) << 16;
+        guard.write_ioredtbl(idx, reg);
+    }
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, Debug)]
+pub enum ApicTriggerMode {
+    Edge = 0,
+    Level = 1,
+}
+#[repr(u8)]
+#[derive(Clone, Copy, Debug)]
+pub enum ApicPolarity {
+    ActiveHigh = 0,
+    ActiveLow = 1,
+}
+#[repr(u8)]
+#[derive(Clone, Copy, Debug)]
+#[allow(unused)]
+pub enum DestinationMode {
+    Physical = 0,
+    Logical = 1,
+}
+#[repr(u8)]
+#[derive(Clone, Copy, Debug)]
+#[allow(unused)]
+pub enum DeliveryMode {
+    Fixed = 0b000,
+    LowestPriority = 0b001,
+    Smi = 0b010,
+    Nmi = 0b100,
+    Init = 0b101,
+    ExtInt = 0b111,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct MapInfo {
+    pub dest: ApicId,
+    pub mask: bool,
+    pub trigger_mode: ApicTriggerMode,
+    pub polarity: ApicPolarity,
+    pub dest_mode: DestinationMode,
+    pub delivery_mode: DeliveryMode,
+    pub vector: u8,
+}
+
+impl MapInfo {
+    /// Packs the fields into the 64-bit redirection table entry layout.
+    pub fn as_raw(&self) -> u64 {
+        assert!(self.vector >= 0x20);
+        assert!(self.vector <= 0xFE);
+
+        // TODO: Check for reserved fields.
+
+        (u64::from(self.dest.get()) << 56)
+            | (u64::from(self.mask) << 16)
+            | ((self.trigger_mode as u64) << 15)
+            | ((self.polarity as u64) << 13)
+            | ((self.dest_mode as u64) << 11)
+            | ((self.delivery_mode as u64) << 8)
+            | u64::from(self.vector)
+    }
+}
+
+impl fmt::Debug for IoApic {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        struct RedirTable<'a>(&'a Mutex<IoApicRegs>);
+
+        impl fmt::Debug for RedirTable<'_> {
+            fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+                let mut guard = self.0.lock();
+
+                // The register holds the index of the last entry, so the range is inclusive.
+                let last = guard.max_redirection_table_entries();
+                f.debug_list()
+                    .entries((0..=last).map(|i| guard.read_ioredtbl(i)))
+                    .finish()
+            }
+        }
+
+        f.debug_struct("IoApic")
+            .field("redir_table", &RedirTable(&self.regs))
+            .field("gsi_start", &self.gsi_start)
+            .field("count", &self.count)
+            .finish()
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum TriggerMode {
+    ConformsToSpecs,
+    Edge,
+    Level,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum Polarity {
+    ConformsToSpecs,
+    ActiveHigh,
+    ActiveLow,
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct Override {
+    bus_irq: u8,
+    gsi: u32,
+
+    trigger_mode: TriggerMode,
+    polarity: Polarity,
+}
+
+// Unsynchronized cell: only written while the I/O APICs are set up (presumably on the BSP, see
+// `bsp_apic_id` in `init`), and when that is done, it's solely accessed immutably.
+static IOAPICS: SyncUnsafeCell<Option<Vec<IoApic>>> = SyncUnsafeCell::new(None);
+
+// Unsynchronized cell for the same reason as above
+static SRC_OVERRIDES: SyncUnsafeCell<Option<Vec<Override>>> = SyncUnsafeCell::new(None);
+
+/// All I/O APICs registered so far; empty until `handle_ioapic` has run.
+pub fn ioapics() -> &'static [IoApic] {
+    unsafe { &*IOAPICS.get() }
+        .as_ref()
+        .map_or(&[], |vector| &vector[..])
+}
+/// All interrupt source overrides registered so far; empty until one has been recorded.
+pub fn src_overrides() -> &'static [Override] {
+    unsafe { &*SRC_OVERRIDES.get() }
+        .as_ref()
+        .map_or(&[], |vector| &vector[..])
+}
+
+/// Maps the registers of the I/O APIC described by `madt_ioapic` and appends it to `IOAPICS`.
+/// Panics if the ID read from the hardware differs from the one in the MADT entry.
+pub unsafe fn handle_ioapic(madt_ioapic: &'static MadtIoApic) {
+    unsafe {
+        // map the I/O APIC registers
+        let virt = map_device_memory(PhysicalAddress::new(madt_ioapic.address as usize), 4096);
+
+        let ioapic_registers = virt.data() as *const u32;
+        let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base);
+
+        assert_eq!(
+            ioapic.regs.lock().id(),
+            madt_ioapic.id,
+            "mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC"
+        );
+
+        (*IOAPICS.get()).get_or_insert_with(Vec::new).push(ioapic);
+    }
+}
+/// Records an interrupt source override from the MADT in `SRC_OVERRIDES`.
+/// Entries whose polarity or trigger mode uses the reserved encoding `0b10` are ignored.
+pub unsafe fn handle_src_override(src_override: &'static MadtIntSrcOverride) {
+    unsafe {
+        let flags = src_override.flags;
+
+        let polarity_raw = (flags & 0x0003) as u8;
+        let trigger_mode_raw = ((flags & 0x000C) >> 2) as u8;
+
+        let polarity = match polarity_raw {
+            0b00 => Polarity::ConformsToSpecs,
+            0b01 => Polarity::ActiveHigh,
+            0b10 => return, // reserved
+            0b11 => Polarity::ActiveLow,
+
+            _ => unreachable!(),
+        };
+
+        let trigger_mode = match trigger_mode_raw {
+            0b00 => TriggerMode::ConformsToSpecs,
+            0b01 => TriggerMode::Edge,
+            0b10 => return, // reserved
+            0b11 => TriggerMode::Level,
+            _ => unreachable!(),
+        };
+
+        let over = Override {
+            bus_irq: src_override.irq_source,
+            gsi: src_override.gsi_base,
+            polarity,
+            trigger_mode,
+        };
+        (*SRC_OVERRIDES.get())
+            .get_or_insert_with(Vec::new)
+            .push(over);
+    }
+}
+
+#[allow(dead_code)]
+pub unsafe fn init() {
+    unsafe {
+        let bsp_apic_id = ApicId::new(u32::from(
+            cpuid().get_feature_info().unwrap().initial_local_apic_id(),
+        )); // TODO: remove
unwraps + + // search the madt for all IOAPICs. + if cfg!(feature = "acpi") { + let madt: &'static Madt = match madt::madt() { + Some(m) => m, + // TODO: Parse MP tables too. + None => return, + }; + if madt.flags & madt::FLAG_PCAT != 0 { + pic::disable(); + } + + // find all I/O APICs (usually one). + + for entry in madt.iter() { + match entry { + MadtEntry::IoApic(ioapic) => handle_ioapic(ioapic), + MadtEntry::IntSrcOverride(src_override) => handle_src_override(src_override), + _ => (), + } + } + } + println!( + "I/O APICs: {:?}, overrides: {:?}", + ioapics(), + src_overrides() + ); + + // map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it + // wouldn't have been disabled due to this I/O APIC) + for legacy_irq in 0..=15 { + let (gsi, trigger_mode, polarity) = match get_override(legacy_irq) { + Some(over) => (over.gsi, over.trigger_mode, over.polarity), + None => { + if src_overrides() + .iter() + .any(|over| over.gsi == u32::from(legacy_irq) && over.bus_irq != legacy_irq) + && !src_overrides() + .iter() + .any(|over| over.bus_irq == legacy_irq) + { + // there's an IRQ conflict, making this legacy IRQ inaccessible. + continue; + } + ( + legacy_irq.into(), + TriggerMode::ConformsToSpecs, + Polarity::ConformsToSpecs, + ) + } + }; + let apic = match find_ioapic(gsi) { + Some(ioapic) => ioapic, + None => { + println!("Unable to find a suitable APIC for legacy IRQ {} (GSI {}). 
It will not be mapped.", legacy_irq, gsi); + continue; + } + }; + let redir_tbl_index = (gsi - apic.gsi_start) as u8; + + let map_info = MapInfo { + // only send to the BSP + dest: bsp_apic_id, + dest_mode: DestinationMode::Physical, + delivery_mode: DeliveryMode::Fixed, + mask: false, + polarity: match polarity { + Polarity::ActiveHigh => ApicPolarity::ActiveHigh, + Polarity::ActiveLow => ApicPolarity::ActiveLow, + Polarity::ConformsToSpecs => ApicPolarity::ActiveHigh, + }, + trigger_mode: match trigger_mode { + TriggerMode::Edge => ApicTriggerMode::Edge, + TriggerMode::Level => ApicTriggerMode::Level, + TriggerMode::ConformsToSpecs => ApicTriggerMode::Edge, + }, + vector: 32 + legacy_irq, + }; + apic.map(redir_tbl_index, map_info); + } + println!( + "I/O APICs: {:?}, overrides: {:?}", + ioapics(), + src_overrides() + ); + irq::set_irq_method(irq::IrqMethod::Apic); + + // tell the firmware that we're using APIC rather than the default 8259 PIC. + + // FIXME: With ACPI moved to userspace, we should instead allow userspace to check whether the + // IOAPIC has been initialized, and then subsequently let some ACPI driver call the AML from + // userspace. 
+ + /*#[cfg(feature = "acpi")] + { + let method = { + let namespace_guard = crate::acpi::ACPI_TABLE.namespace.read(); + if let Some(value) = namespace_guard.as_ref().unwrap().get("\\_PIC") { + value.get_as_method().ok() + } else { + None + } + }; + if let Some(m) = method { + m.execute("\\_PIC".into(), vec!(crate::acpi::aml::AmlValue::Integer(1))); + } + }*/ + } +} +fn get_override(irq: u8) -> Option<&'static Override> { + src_overrides().iter().find(|over| over.bus_irq == irq) +} +fn resolve(irq: u8) -> u32 { + get_override(irq).map_or(u32::from(irq), |over| over.gsi) +} +fn find_ioapic(gsi: u32) -> Option<&'static IoApic> { + ioapics() + .iter() + .find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count)) +} + +pub unsafe fn mask(irq: u8) { + let gsi = resolve(irq); + let apic = match find_ioapic(gsi) { + Some(a) => a, + None => return, + }; + apic.set_mask(gsi, true); +} +pub unsafe fn unmask(irq: u8) { + let gsi = resolve(irq); + let apic = match find_ioapic(gsi) { + Some(a) => a, + None => return, + }; + apic.set_mask(gsi, false); +} diff --git a/src/arch/x86_shared/device/local_apic.rs b/src/arch/x86_shared/device/local_apic.rs new file mode 100644 index 0000000000..b6afe02afe --- /dev/null +++ b/src/arch/x86_shared/device/local_apic.rs @@ -0,0 +1,272 @@ +use core::{ + cell::SyncUnsafeCell, + ptr::{read_volatile, write_volatile}, +}; +use x86::msr::*; + +use crate::{ + arch::{cpuid::cpuid, ipi::IpiKind}, + memory::{map_device_memory, PhysicalAddress}, + percpu::PercpuBlock, +}; + +#[derive(Clone, Copy, Debug)] +pub struct ApicId(u32); + +impl ApicId { + pub fn new(inner: u32) -> Self { + Self(inner) + } + + pub fn get(&self) -> u32 { + self.0 + } +} + +static LOCAL_APIC: SyncUnsafeCell = SyncUnsafeCell::new(LocalApic { + address: 0, + x2: false, +}); +pub unsafe fn the_local_apic() -> &'static mut LocalApic { + unsafe { &mut *LOCAL_APIC.get() } +} + +pub unsafe fn init() { + unsafe { + the_local_apic().init(); + } +} + +pub unsafe fn 
init_ap() { + unsafe { + the_local_apic().init_ap(); + } +} + +/// Local APIC +pub struct LocalApic { + pub address: usize, + pub x2: bool, +} + +impl LocalApic { + unsafe fn init(&mut self) { + unsafe { + let physaddr = PhysicalAddress::new(rdmsr(IA32_APIC_BASE) as usize & 0xFFFF_0000); + + self.x2 = cpuid() + .get_feature_info() + .is_some_and(|feature_info| feature_info.has_x2apic()); + + if !self.x2 { + debug!("Detected xAPIC at {:#x}", physaddr.data()); + self.address = map_device_memory(physaddr, 4096).data(); + } else { + debug!("Detected x2APIC"); + } + + self.init_ap(); + } + } + + unsafe fn init_ap(&mut self) { + unsafe { + if self.x2 { + wrmsr(IA32_APIC_BASE, rdmsr(IA32_APIC_BASE) | (1 << 10)); + wrmsr(IA32_X2APIC_SIVR, 0x100); + } else { + self.write(0xF0, 0x100); + } + self.setup_error_int(); + //self.setup_timer(); + + PercpuBlock::current() + .misc_arch_info + .apic_id_opt + .set(Some(self.id())); + } + } + + unsafe fn read(&self, reg: u32) -> u32 { + debug_assert!(!self.x2); + unsafe { read_volatile((self.address + reg as usize) as *const u32) } + } + + unsafe fn write(&mut self, reg: u32, value: u32) { + debug_assert!(!self.x2); + unsafe { + write_volatile((self.address + reg as usize) as *mut u32, value); + } + } + + pub fn id(&self) -> ApicId { + ApicId::new(if self.x2 { + unsafe { rdmsr(IA32_X2APIC_APICID) as u32 } + } else { + unsafe { self.read(0x20) } + }) + } + + pub fn version(&self) -> u32 { + if self.x2 { + unsafe { rdmsr(IA32_X2APIC_VERSION) as u32 } + } else { + unsafe { self.read(0x30) } + } + } + + pub fn icr(&self) -> u64 { + if self.x2 { + unsafe { rdmsr(IA32_X2APIC_ICR) } + } else { + unsafe { ((self.read(0x310) as u64) << 32) | self.read(0x300) as u64 } + } + } + + pub fn set_icr(&mut self, value: u64) { + if self.x2 { + unsafe { + wrmsr(IA32_X2APIC_ICR, value); + } + } else { + unsafe { + const PENDING: u32 = 1 << 12; + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } + self.write(0x310, (value >> 32) 
as u32); + self.write(0x300, value as u32); + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } + } + } + } + + pub fn ipi(&mut self, apic_id: ApicId, kind: IpiKind) { + let shift = if self.x2 { 32 } else { 56 }; + self.set_icr((u64::from(apic_id.get()) << shift) | 0x40 | kind as u64); + } + pub fn ipi_nmi(&mut self, apic_id: ApicId) { + let shift = if self.x2 { 32 } else { 56 }; + self.set_icr((u64::from(apic_id.get()) << shift) | (1 << 14) | (0b100 << 8)); + } + + pub unsafe fn eoi(&mut self) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_EOI, 0); + } else { + self.write(0xB0, 0); + } + } + } + /// Reads the Error Status Register. + pub unsafe fn esr(&mut self) -> u32 { + unsafe { + if self.x2 { + // update the ESR to the current state of the local apic. + wrmsr(IA32_X2APIC_ESR, 0); + // read the updated value + rdmsr(IA32_X2APIC_ESR) as u32 + } else { + self.write(0x280, 0); + self.read(0x280) + } + } + } + pub unsafe fn lvt_timer(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_LVT_TIMER) as u32 + } else { + self.read(0x320) + } + } + } + pub unsafe fn set_lvt_timer(&mut self, value: u32) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_LVT_TIMER, u64::from(value)); + } else { + self.write(0x320, value); + } + } + } + pub unsafe fn init_count(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_INIT_COUNT) as u32 + } else { + self.read(0x380) + } + } + } + pub unsafe fn set_init_count(&mut self, initial_count: u32) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_INIT_COUNT, u64::from(initial_count)); + } else { + self.write(0x380, initial_count); + } + } + } + pub unsafe fn cur_count(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_CUR_COUNT) as u32 + } else { + self.read(0x390) + } + } + } + pub unsafe fn div_conf(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_DIV_CONF) as u32 + } else { + self.read(0x3E0) + } + } + } + pub unsafe fn set_div_conf(&mut self, div_conf: 
u32) {
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_X2APIC_DIV_CONF, u64::from(div_conf));
+            } else {
+                self.write(0x3E0, div_conf);
+            }
+        }
+    }
+    pub unsafe fn lvt_error(&mut self) -> u32 {
+        unsafe {
+            if self.x2 {
+                rdmsr(IA32_X2APIC_LVT_ERROR) as u32
+            } else {
+                self.read(0x370)
+            }
+        }
+    }
+    pub unsafe fn set_lvt_error(&mut self, lvt_error: u32) {
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_X2APIC_LVT_ERROR, u64::from(lvt_error));
+            } else {
+                self.write(0x370, lvt_error);
+            }
+        }
+    }
+    unsafe fn setup_error_int(&mut self) {
+        unsafe {
+            let vector = 49u32;
+            self.set_lvt_error(vector);
+        }
+    }
+}
+
+#[repr(u8)]
+pub enum LvtTimerMode {
+    OneShot = 0b00,
+    Periodic = 0b01,
+    TscDeadline = 0b10,
+}
diff --git a/src/arch/x86_shared/device/mod.rs b/src/arch/x86_shared/device/mod.rs
new file mode 100644
index 0000000000..6f41770601
--- /dev/null
+++ b/src/arch/x86_shared/device/mod.rs
@@ -0,0 +1,104 @@
+use core::cell::Cell;
+
+pub mod cpu;
+pub mod hpet;
+pub mod ioapic;
+pub mod local_apic;
+pub mod pic;
+pub mod pit;
+pub mod serial;
+#[cfg(feature = "system76_ec_debug")]
+pub mod system76_ec;
+
+#[cfg(feature = "x86_kvm_pv")]
+pub mod tsc;
+
+/// Early device setup: initializes the PIC, then the local APIC.
+pub unsafe fn init() {
+    unsafe {
+        pic::init();
+        local_apic::init();
+
+        // Run here for the side effect of printing if KVM was used to avoid interleaved logs.
+        // The `tsc` module only exists with `x86_kvm_pv`, so the call must be gated as well.
+        #[cfg(feature = "x86_kvm_pv")]
+        tsc::get_kvm_support();
+    }
+}
+pub unsafe fn init_after_acpi() {
+    // this will disable the IOAPIC if needed.
+    //ioapic::init(mapper);
+}
+
+/// Tries to bring up the HPET described by ACPI.
+///
+/// Returns `false` when ACPI support is compiled out, no HPET table is present, the target is
+/// 32-bit x86, or `hpet::init` itself fails.
+unsafe fn init_hpet() -> bool {
+    if cfg!(not(feature = "acpi")) {
+        return false;
+    }
+
+    unsafe {
+        use crate::acpi::ACPI_TABLE;
+        match *ACPI_TABLE.hpet.write() {
+            Some(ref mut hpet) => {
+                if cfg!(target_arch = "x86") {
+                    //TODO: fix HPET on i686
+                    warn!("HPET found but not implemented on i686");
+                    return false;
+                }
+                hpet::init(hpet)
+            }
+            _ => false,
+        }
+    }
+}
+
+/// Selects the system timer: the HPET if `init_hpet` succeeds, otherwise the PIT.
+pub unsafe fn init_noncore() {
+    unsafe {
+        debug!("Initializing system timer");
+
+        #[cfg(feature = "x86_kvm_pv")]
+        if tsc::init() {
+            debug!("TSC used as system clock source");
+        }
+
+        if init_hpet() {
+            debug!("HPET used as system timer");
+        } else {
+            pit::init();
+            debug!("PIT used as system timer");
+        }
+
+        debug!("Finished initializing devices");
+    }
+}
+
+/// Per-processor setup: `local_apic::init_ap`, plus `tsc::init` when `x86_kvm_pv` is enabled.
+pub unsafe fn init_ap() {
+    unsafe {
+        local_apic::init_ap();
+
+        #[cfg(feature = "x86_kvm_pv")]
+        tsc::init();
+    }
+}
+
+/// Architecture-specific fields of the per-CPU block (see `PercpuBlock::misc_arch_info`).
+pub struct ArchPercpuMisc {
+    pub apic_id_opt: Cell<Option<local_apic::ApicId>>,
+    #[cfg(feature = "x86_kvm_pv")]
+    pub tsc_info: tsc::TscPercpu,
+}
+
+impl ArchPercpuMisc {
+    pub const fn default() -> Self {
+        Self {
+            apic_id_opt: Cell::new(None),
+            #[cfg(feature = "x86_kvm_pv")]
+            tsc_info: tsc::TscPercpu::default(),
+        }
+    }
+}
diff --git a/src/arch/x86_shared/device/pic.rs b/src/arch/x86_shared/device/pic.rs
new file mode 100644
index 0000000000..0cf54ba801
--- /dev/null
+++ b/src/arch/x86_shared/device/pic.rs
@@ -0,0 +1,103 @@
+use core::cell::SyncUnsafeCell;
+
+use crate::{
+    arch::interrupt::irq,
+    syscall::io::{Io, Pio},
+};
+
+static MASTER: SyncUnsafeCell<Pic> = SyncUnsafeCell::new(Pic::new(0x20));
+static SLAVE: SyncUnsafeCell<Pic> = SyncUnsafeCell::new(Pic::new(0xA0));
+
+// SAFETY: must be main thread
+pub unsafe fn master<'a>() -> &'a mut Pic {
+    unsafe { &mut *MASTER.get() }
+}
+// SAFETY: must be main thread
+pub unsafe fn slave<'a>() -> &'a mut Pic {
+    unsafe { &mut *SLAVE.get() }
+}
+
+/// Runs the initialization sequence on both PICs, remapping them to vectors 0x20 and 0x28.
+pub unsafe fn init() {
+    unsafe {
+        let master = master();
+        let slave = slave();
+
+        // Start initialization
+        master.cmd.write(0x11);
+        slave.cmd.write(0x11);
+
+        // Set offsets
+        master.data.write(0x20);
+        slave.data.write(0x28);
+
+        // Set up cascade
+        master.data.write(4);
+        slave.data.write(2);
+
+        // Set up interrupt mode (1 is 8086/88 mode, 2 is auto EOI)
+        master.data.write(1);
+        slave.data.write(1);
+
+        // Unmask interrupts
+        master.data.write(0);
+        slave.data.write(0);
+
+        // Ack remaining interrupts
+        master.ack();
+        slave.ack();
+
+        // probably already set to PIC, but double-check
+        irq::set_irq_method(irq::IrqMethod::Pic);
+    }
+}
+
+/// Masks every IRQ line on both PICs.
+pub unsafe fn disable() {
+    unsafe {
+        master().data.write(0xFF);
+        slave().data.write(0xFF);
+    }
+}
+
+/// One 8259 PIC, addressed through its command port and its data port (command port + 1).
+pub struct Pic {
+    cmd: Pio<u8>,
+    data: Pio<u8>,
+}
+
+impl Pic {
+    pub const fn new(port: u16) -> Pic {
+        Pic {
+            cmd: Pio::new(port),
+            data: Pio::new(port + 1),
+        }
+    }
+
+    pub fn ack(&mut self) {
+        self.cmd.write(0x20);
+    }
+
+    pub fn mask_set(&mut self, irq: u8) {
+        assert!(irq < 8);
+
+        let mut mask = self.data.read();
+        mask |= 1 << irq;
+        self.data.write(mask);
+    }
+
+    pub fn mask_clear(&mut self, irq: u8) {
+        assert!(irq < 8);
+
+        let mut mask = self.data.read();
+        mask &= !(1 << irq);
+        self.data.write(mask);
+    }
+    /// A bitmap of all currently servicing IRQs. Spurious IRQs will not have this bit set
+    pub fn isr(&mut self) -> u8 {
+        // OCW3 0x0B selects the In-Service Register for the next read of the command port;
+        // 0x0A selects the Interrupt Request Register, which is not what this method reports.
+        self.cmd.write(0x0B);
+        self.cmd.read() // note that cmd is read, rather than data
+    }
+}
diff --git a/src/arch/x86_shared/device/pit.rs b/src/arch/x86_shared/device/pit.rs
new file mode 100644
index 0000000000..d8013ee58c
--- /dev/null
+++ b/src/arch/x86_shared/device/pit.rs
@@ -0,0 +1,52 @@
+use core::cell::SyncUnsafeCell;
+
+use crate::syscall::io::{Io, Pio};
+
+static CHAN0: SyncUnsafeCell<Pio<u8>> = SyncUnsafeCell::new(Pio::new(0x40));
+//pub static mut CHAN1: Pio<u8> = Pio::new(0x41);
+//pub static mut CHAN2: Pio<u8> = Pio::new(0x42);
+static COMMAND: SyncUnsafeCell<Pio<u8>> = SyncUnsafeCell::new(Pio::new(0x43));
+
+// SAFETY: must be externally syncd
+pub unsafe fn chan0<'a>() -> &'a mut Pio<u8> {
+    unsafe { &mut *CHAN0.get() }
+}
+// SAFETY: must be externally syncd
+pub unsafe fn command<'a>() -> &'a mut Pio<u8> {
+    unsafe { &mut *COMMAND.get() }
+}
+
+const SELECT_CHAN0: u8 = 0b00 << 6;
+const ACCESS_LATCH: u8 = 0b00 << 4;
+const ACCESS_LOHI: u8 = 0b11 << 4;
+const MODE_2: u8 = 0b010 << 1;
+
+// 1 / (1.193182 MHz) = 838,095,110 femtoseconds ~= 838.095 ns
+pub const PERIOD_FS: u128 = 838_095_110;
+
+// 4847 / (1.193182 MHz) = 4,062,247 ns ~= 4.1 ms or 246 Hz
+pub const CHAN0_DIVISOR: u16 = 4847;
+
+// Calculated interrupt period in nanoseconds based on divisor and period
+pub const RATE: u128 = (CHAN0_DIVISOR as u128 * PERIOD_FS) / 1_000_000;
+
+/// Programs channel 0 in mode 2 with `CHAN0_DIVISOR`, written low byte first.
+pub unsafe fn init() {
+    unsafe {
+        command().write(SELECT_CHAN0 | ACCESS_LOHI | MODE_2);
+        chan0().write(CHAN0_DIVISOR as u8);
+        chan0().write((CHAN0_DIVISOR >> 8) as u8);
+    }
+}
+
+/// Latches channel 0 and returns how far the counter has counted down from `CHAN0_DIVISOR`.
+pub unsafe fn read() -> u16 {
+    unsafe {
+        command().write(SELECT_CHAN0 | ACCESS_LATCH);
+        let low = chan0().read();
+        let high = chan0().read();
+        let counter = ((high as u16) << 8) | (low as u16);
+        // Counter is inverted, subtract from CHAN0_DIVISOR
+        CHAN0_DIVISOR.saturating_sub(counter)
+    }
+}
diff --git a/src/arch/x86_shared/device/rtc.rs b/src/arch/x86_shared/device/rtc.rs
new file mode
100644 index 0000000000..e69de29bb2 diff --git a/src/arch/x86_shared/device/serial.rs b/src/arch/x86_shared/device/serial.rs new file mode 100644 index 0000000000..67aa04c23a --- /dev/null +++ b/src/arch/x86_shared/device/serial.rs @@ -0,0 +1,48 @@ +use crate::{ + devices::{serial::SerialKind, uart_16550::SerialPort}, + memory::map_device_memory, + syscall::io::{Mmio, Pio}, +}; +use spin::Mutex; + +pub static COM1: Mutex = Mutex::new(SerialKind::NotPresent); +pub static COM2: Mutex = Mutex::new(SerialKind::NotPresent); + +pub static LPSS: Mutex = Mutex::new(SerialKind::NotPresent); + +pub unsafe fn init() { + #[cfg(feature = "system76_ec_debug")] + super::system76_ec::init(); + + if cfg!(not(feature = "serial_debug")) { + // FIXME remove serial_debug feature once ACPI SPCR is respected on UEFI boots. + return; + } + + let mut com1 = SerialPort::>::new(0x3F8); + if com1.init().is_ok() { + *COM1.lock() = SerialKind::Ns16550Pio(com1); + } + let mut com2 = SerialPort::>::new(0x2F8); + if com2.init().is_ok() { + *COM2.lock() = SerialKind::Ns16550Pio(com2); + } + + // FIXME remove explicit LPSS handling once ACPI SPCR is supported + if cfg!(not(feature = "lpss_debug")) { + return; + } + + let virt = unsafe { + map_device_memory( + // TODO: Make this configurable + crate::memory::PhysicalAddress::new(0xFE032000), + 4, + ) + }; + + let lpss = unsafe { SerialPort::>::new(virt.data()) }; + if lpss.init().is_ok() { + *LPSS.lock() = SerialKind::Ns16550u32(lpss); + } +} diff --git a/src/arch/x86_shared/device/system76_ec.rs b/src/arch/x86_shared/device/system76_ec.rs new file mode 100644 index 0000000000..dca4d5efe3 --- /dev/null +++ b/src/arch/x86_shared/device/system76_ec.rs @@ -0,0 +1,89 @@ +use spin::Mutex; +use syscall::io::{Io, Pio}; + +pub static SYSTEM76_EC: Mutex> = Mutex::new(None); + +pub fn init() { + *SYSTEM76_EC.lock() = System76Ec::new(); +} + +pub struct System76Ec { + base: u16, +} + +impl System76Ec { + pub fn new() -> Option { + let mut system76_ec = Self { 
base: 0x0E00 }; + if system76_ec.probe() { + Some(system76_ec) + } else { + None + } + } + + #[inline(always)] + pub fn read(&mut self, addr: u8) -> u8 { + Pio::::new(self.base + addr as u16).read() + } + + #[inline(always)] + pub fn write(&mut self, addr: u8, data: u8) { + Pio::::new(self.base + addr as u16).write(data) + } + + pub fn probe(&mut self) -> bool { + // Send probe command + self.write(0, 1); + + // Wait for response + let mut timeout = 1_000_000; + while timeout > 0 { + if self.read(0) == 0 { + break; + } + timeout -= 1; + } + if timeout == 0 { + return false; + } + + // Return false on command error + if self.read(1) != 0 { + return false; + } + + // Must receive 0x76, 0xEC as signature + self.read(2) == 0x76 && self.read(3) == 0xEC + } + + pub fn flush(&mut self) { + // Send command + self.write(0, 4); + + // TODO: timeout + while self.read(0) != 0 {} + + // Clear length + self.write(3, 0); + } + + pub fn print(&mut self, byte: u8) { + // Read length + let len = self.read(3); + // Write data at offset + self.write(len + 4, byte); + // Update length + self.write(3, len + 1); + + // If we hit the end of the buffer, or were given a newline, flush + if byte == b'\n' || len >= 128 { + self.flush(); + } + } + + pub fn print_slice(&mut self, bytes: &[u8]) { + for &byte in bytes { + self.print(byte); + } + } +} diff --git a/src/arch/x86_shared/device/tsc.rs b/src/arch/x86_shared/device/tsc.rs new file mode 100644 index 0000000000..bfcf7a68ae --- /dev/null +++ b/src/arch/x86_shared/device/tsc.rs @@ -0,0 +1,161 @@ +use core::{cell::Cell, ptr::addr_of}; + +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::__cpuid; + +#[cfg(target_arch = "x86")] +use core::arch::x86::__cpuid; + +use rmm::Arch; +use spin::Once; + +use crate::{memory::allocate_frame, percpu::PercpuBlock}; + +pub struct KvmSupport { + max_leaf: u32, + supp_feats: KvmFeatureBits, +} +bitflags! 
{ + // https://www.kernel.org/doc/html/latest/virt/kvm/x86/cpuid.html + #[derive(Debug)] + struct KvmFeatureBits: u32 { + const CLOCKSOURCE = 1 << 0; + const CLOCKSOURCE2 = 1 << 3; + const CLOCKSOURCE_STABLE = 1 << 24; + } +} + +// https://www.kernel.org/doc/html/v5.9/virt/kvm/msr.html +#[repr(C, packed)] +#[derive(Clone, Copy, Debug)] +struct PvclockVcpuTimeInfo { + version: u32, + pad: u32, + tsc_timestamp: u64, + system_time: u64, + tsc_to_system_mul: u32, + tsc_shift: i8, + flags: u8, + _pad: [u8; 2], +} + +const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01; +const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b564d00; + +pub struct TscPercpu { + vcpu_page: Cell<*const PvclockVcpuTimeInfo>, + prev: Cell, +} +impl TscPercpu { + pub const fn default() -> Self { + Self { + vcpu_page: Cell::new(core::ptr::null()), + prev: Cell::new(0), + } + } +} + +pub fn monotonic_absolute() -> Option { + let inf = &PercpuBlock::current().misc_arch_info.tsc_info; + let ptr = inf.vcpu_page.get(); + if ptr.is_null() { + return None; + } + loop { + unsafe { + let cur_version = addr_of!((*ptr).version).read_volatile(); + if cur_version & 1 == 1 { + continue; + } + let elapsed_ticks = + x86::time::rdtsc().saturating_sub(addr_of!((*ptr).tsc_timestamp).read_volatile()); + let tsc_shift = addr_of!((*ptr).tsc_shift).read_volatile(); + let elapsed = if tsc_shift >= 0 { + elapsed_ticks.checked_shl(tsc_shift as u32).unwrap() + } else { + elapsed_ticks.checked_shr((-tsc_shift) as u32).unwrap() + }; + let system_time = addr_of!((*ptr).system_time).read_volatile(); + let tsc_to_system_mul = addr_of!((*ptr).tsc_to_system_mul).read_volatile(); + let new_version = addr_of!((*ptr).version).read_volatile(); + if new_version != cur_version || new_version & 1 == 1 { + continue; + } + let delta = (u128::from(elapsed) * u128::from(tsc_to_system_mul)) >> 32; + let time = u128::from(system_time) + delta; + let prev = inf.prev.replace(time); + if prev > time { + // TODO + error!("TSC wraparound ({prev} > {time})"); + 
return None; + } + assert!(prev <= time); + return Some(time); + } + } +} + +pub fn get_kvm_support() -> &'static Option { + static KVM_SUPPORT: Once> = Once::new(); + + KVM_SUPPORT.call_once(|| { + let res = unsafe { __cpuid(0x4000_0000) }; + if [res.ebx, res.ecx, res.edx].map(u32::to_le_bytes) != [*b"KVMK", *b"VMKV", *b"M\0\0\0"] { + return None; + } + let max_leaf = res.eax; + if max_leaf < 0x4000_0001 { + return None; + } + let res = unsafe { __cpuid(0x4000_0001) }; + + let supp_feats = KvmFeatureBits::from_bits_retain(res.eax); + + debug!("Detected KVM paravirtualization support, features {supp_feats:?}"); + + Some(KvmSupport { + max_leaf, + supp_feats, + }) + }) +} + +pub unsafe fn init() -> bool { + unsafe { + let cpuid = crate::arch::cpuid::cpuid(); + if !cpuid.get_feature_info().is_some_and(|f| f.has_tsc()) { + return false; + } + + let kvm_support = get_kvm_support(); + + if let Some(kvm_support) = kvm_support + && kvm_support + .supp_feats + .contains(KvmFeatureBits::CLOCKSOURCE2 | KvmFeatureBits::CLOCKSOURCE_STABLE) + { + let frame = allocate_frame().expect("failed to allocate timer page"); + x86::msr::wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (frame.base().data() as u64) | 1); + let ptr = crate::memory::RmmA::phys_to_virt(frame.base()).data() + as *const PvclockVcpuTimeInfo; + PercpuBlock::current() + .misc_arch_info + .tsc_info + .vcpu_page + .set(ptr); + + /*let tsc_ghz = loop { + let val1 = ptr.read_volatile(); + let val2 = ptr.read_volatile(); + if val1.version & 1 == 1 || val2.version & 1 == 1 || val1.version != val2.version { + continue; + } + let val1 + break tsc_hz / 1_000_000_000; + };*/ + true + } else { + false + } + } +} diff --git a/src/arch/x86_shared/gdt.rs b/src/arch/x86_shared/gdt.rs new file mode 100644 index 0000000000..cad344f3c2 --- /dev/null +++ b/src/arch/x86_shared/gdt.rs @@ -0,0 +1,431 @@ +//! 
Global descriptor table + +use core::ptr; + +#[cfg(target_arch = "x86")] +use x86::bits32::task::TaskStateSegment; +#[cfg(target_arch = "x86_64")] +use x86::bits64::task::TaskStateSegment; +use x86::{ + dtables::{self, DescriptorTablePointer}, + segmentation::{self, Descriptor as SegmentDescriptor, SegmentSelector}, + task, Ring, +}; + +use crate::{ + cpu_set::LogicalCpuId, + memory::{RmmA, RmmArch, PAGE_SIZE}, + percpu::PercpuBlock, +}; + +pub const GDT_NULL: usize = 0; +pub const GDT_KERNEL_CODE: usize = 1; +pub const GDT_KERNEL_DATA: usize = 2; +#[cfg(target_arch = "x86")] +pub const GDT_KERNEL_PERCPU: usize = 3; +#[cfg(target_arch = "x86_64")] +pub const GDT_USER_CODE32_UNUSED: usize = 3; +pub const GDT_USER_DATA: usize = 4; +pub const GDT_USER_CODE: usize = 5; +#[cfg(target_arch = "x86")] +pub const GDT_USER_FS: usize = 6; +#[cfg(target_arch = "x86")] +pub const GDT_USER_GS: usize = 7; +#[cfg(target_arch = "x86")] +pub const GDT_TSS: usize = 8; +#[cfg(target_arch = "x86_64")] +pub const GDT_TSS: usize = 6; +#[cfg(target_arch = "x86_64")] +pub const GDT_TSS_HIGH: usize = 7; + +pub const GDT_A_PRESENT: u8 = 1 << 7; +pub const GDT_A_RING_0: u8 = 0 << 5; +pub const GDT_A_RING_1: u8 = 1 << 5; +pub const GDT_A_RING_2: u8 = 2 << 5; +pub const GDT_A_RING_3: u8 = 3 << 5; +pub const GDT_A_SYSTEM: u8 = 1 << 4; +pub const GDT_A_EXECUTABLE: u8 = 1 << 3; +pub const GDT_A_CONFORMING: u8 = 1 << 2; +pub const GDT_A_PRIVILEGE: u8 = 1 << 1; +pub const GDT_A_DIRTY: u8 = 1; + +pub const GDT_A_TSS_AVAIL: u8 = 0x9; +pub const GDT_A_TSS_BUSY: u8 = 0xB; + +pub const GDT_F_PAGE_SIZE: u8 = 1 << 7; +pub const GDT_F_PROTECTED_MODE: u8 = 1 << 6; +pub const GDT_F_LONG_MODE: u8 = 1 << 5; + +const IOBITMAP_SIZE: u32 = 65536 / 8; + +#[cfg(target_arch = "x86")] +const SEGMENT_LIMIT: u32 = 0xFFFFF; +#[cfg(target_arch = "x86_64")] +const SEGMENT_LIMIT: u32 = 0; + +#[cfg(target_arch = "x86")] +const SEGMENT_FLAGS: u8 = GDT_F_PAGE_SIZE | GDT_F_PROTECTED_MODE; +#[cfg(target_arch = "x86_64")] +const 
SEGMENT_FLAGS: u8 = GDT_F_LONG_MODE; + +#[cfg(target_arch = "x86")] +const SEGMENT_COUNT: usize = 9; +#[cfg(target_arch = "x86_64")] +const SEGMENT_COUNT: usize = 8; + +// Later copied into the actual GDT with various fields set. +const BASE_GDT: [GdtEntry; SEGMENT_COUNT] = [ + // Null + GdtEntry::new(0, 0, 0, 0), + // Kernel code + GdtEntry::new( + 0, + SEGMENT_LIMIT, + GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, + SEGMENT_FLAGS, + ), + // Kernel data + GdtEntry::new( + 0, + SEGMENT_LIMIT, + GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, + SEGMENT_FLAGS, + ), + // Kernel TLS + #[cfg(target_arch = "x86")] + GdtEntry::new( + 0, + SEGMENT_LIMIT, + GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, + SEGMENT_FLAGS, + ), + // Dummy 32-bit user code - apparently necessary for SYSRET. We restrict it to ring 0 anyway. + #[cfg(target_arch = "x86_64")] + GdtEntry::new( + 0, + 0, + GDT_A_PRESENT | GDT_A_RING_0 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, + GDT_F_PROTECTED_MODE, + ), + // User data + GdtEntry::new( + 0, + SEGMENT_LIMIT, + GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, + SEGMENT_FLAGS, + ), + // User code + GdtEntry::new( + 0, + SEGMENT_LIMIT, + GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_EXECUTABLE | GDT_A_PRIVILEGE, + SEGMENT_FLAGS, + ), + // User FS (for TLS) + #[cfg(target_arch = "x86")] + GdtEntry::new( + 0, + SEGMENT_LIMIT, + GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, + SEGMENT_FLAGS, + ), + // User GS (for TLS) + #[cfg(target_arch = "x86")] + GdtEntry::new( + 0, + SEGMENT_LIMIT, + GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_SYSTEM | GDT_A_PRIVILEGE, + SEGMENT_FLAGS, + ), + // TSS + GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_TSS_AVAIL, 0), + // TSS must be 16 bytes long, twice the normal size + #[cfg(target_arch = "x86_64")] + GdtEntry::new(0, 0, 0, 0), +]; + +#[repr(C, align(16))] +struct Align([u64; 2]); + +#[repr(C, 
align(4096))] +pub struct ProcessorControlRegion { + // TODO: When both KASLR and KPTI are implemented, the PCR may need to be split into two pages, + // such that "secret" kernel addresses are only stored in the protected half. + pub self_ref: *mut ProcessorControlRegion, + + pub user_rsp_tmp: usize, + // The GDT *must* be stored in the PCR! The paranoid interrupt handler, lacking a reliable way + // to correctly obtain GSBASE, uses SGDT to calculate the PCR offset. + pub gdt: [GdtEntry; SEGMENT_COUNT], + pub percpu: PercpuBlock, + _rsvd: Align, + pub tss: TaskStateSegment, + + // These two fields are read by the CPU, but not currently modified by the kernel. Instead, the + // kernel sets the `iomap_base` field in the TSS, to either point to this bitmap, or outside + // the TSS, in which case userspace is not granted port IO access. + pub _iobitmap: [u8; IOBITMAP_SIZE as usize], + pub _all_ones: u8, +} + +const _: () = { + if core::mem::offset_of!(ProcessorControlRegion, tss) % 16 != 0 { + panic!("PCR is incorrectly defined, TSS alignment is too small"); + } + if core::mem::offset_of!(ProcessorControlRegion, gdt) % 8 != 0 { + panic!("PCR is incorrectly defined, GDT alignment is too small"); + } +}; + +impl ProcessorControlRegion { + const fn new_partial_init(cpu_id: LogicalCpuId) -> Self { + Self { + self_ref: ptr::null_mut(), + user_rsp_tmp: 0, + gdt: BASE_GDT, + percpu: PercpuBlock::init(cpu_id), + _rsvd: Align([0; 2]), + tss: TaskStateSegment::new(), + _iobitmap: [0; IOBITMAP_SIZE as usize], + _all_ones: 0xFF, + } + } +} + +pub unsafe fn pcr() -> *mut ProcessorControlRegion { + unsafe { + // Primitive benchmarking of RDFSBASE and RDGSBASE in userspace, appears to indicate that + // obtaining FSBASE/GSBASE using mov gs:[gs_self_ref] is faster than using the (probably + // microcoded) instructions. 
+ let mut ret: *mut ProcessorControlRegion; + core::arch::asm!("mov {}, gs:[{}]", out(reg) ret, const(core::mem::offset_of!(ProcessorControlRegion, self_ref))); + ret + } +} + +#[cfg(feature = "pti")] +pub unsafe fn set_tss_stack(pcr: *mut ProcessorControlRegion, stack: usize) { + use super::pti::{PTI_CONTEXT_STACK, PTI_CPU_STACK}; + + #[cfg(target_arch = "x86")] + unsafe { + core::ptr::addr_of_mut!((*pcr).tss.ss0).write((GDT_KERNEL_DATA << 3) as u16); + core::ptr::addr_of_mut!((*pcr).tss.esp0) + .write((PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()) as u32); + } + + #[cfg(target_arch = "x86_64")] + unsafe { + core::ptr::addr_of_mut!((*pcr).tss.rsp[0]) + .write_unaligned((PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len()) as u64); + } + + unsafe { PTI_CONTEXT_STACK = stack }; +} + +#[cfg(not(feature = "pti"))] +pub unsafe fn set_tss_stack(pcr: *mut ProcessorControlRegion, stack: usize) { + #[cfg(target_arch = "x86")] + unsafe { + core::ptr::addr_of_mut!((*pcr).tss.ss0).write((GDT_KERNEL_DATA << 3) as u16); + core::ptr::addr_of_mut!((*pcr).tss.esp0).write(stack as u32); + } + + #[cfg(target_arch = "x86_64")] + unsafe { + // TODO: If this increases performance, read gs:[offset] directly + core::ptr::addr_of_mut!((*pcr).tss.rsp[0]).write_unaligned(stack as u64); + } +} + +pub unsafe fn set_userspace_io_allowed(pcr: *mut ProcessorControlRegion, allowed: bool) { + let offset = if allowed { + u16::try_from(size_of::()).expect("guaranteed to fit in u16") + } else { + 0xFFFF + }; + + unsafe { + #[cfg(target_arch = "x86")] + core::ptr::addr_of_mut!((*pcr).tss.iobp_offset).write(offset); + + #[cfg(target_arch = "x86_64")] + core::ptr::addr_of_mut!((*pcr).tss.iomap_base).write(offset); + } +} + +#[cold] +fn init_pcr(pcr: &mut ProcessorControlRegion, stack_end: usize) { + pcr.self_ref = pcr as *mut _; + + // Setup the GDT. 
+ #[cfg(target_arch = "x86")] + pcr.gdt[GDT_KERNEL_PERCPU].set_offset(pcr as *const _ as u32); + + #[cfg(target_arch = "x86")] + { + pcr.tss.iobp_offset = 0xFFFF; + let tss = &pcr.tss as *const _ as usize as u32; + + pcr.gdt[GDT_TSS].set_offset(tss); + pcr.gdt[GDT_TSS].set_limit(size_of::() as u32 + IOBITMAP_SIZE as u32); + } + + #[cfg(target_arch = "x86_64")] + { + pcr.tss.iomap_base = 0xFFFF; + + let tss = &mut pcr.tss as *mut TaskStateSegment as usize as u64; + let tss_lo = (tss & 0xFFFF_FFFF) as u32; + let tss_hi = (tss >> 32) as u32; + + pcr.gdt[GDT_TSS].set_offset(tss_lo); + pcr.gdt[GDT_TSS].set_limit(size_of::() as u32 + IOBITMAP_SIZE); + + // GDT is aligned to 8 bytes + #[expect(clippy::cast_ptr_alignment)] + unsafe { + (&mut pcr.gdt[GDT_TSS_HIGH] as *mut GdtEntry) + .cast::() + .write(tss_hi); + } + } + + // Set the stack pointer to use when coming back from userspace. + unsafe { + set_tss_stack(pcr, stack_end); + } +} + +#[cold] +pub unsafe fn install_pcr(pcr_ptr: *mut ProcessorControlRegion) { + let pcr = unsafe { &mut *pcr_ptr }; + + let gdtr: DescriptorTablePointer = DescriptorTablePointer { + limit: const { (SEGMENT_COUNT * size_of::() - 1) as u16 }, + base: pcr.gdt.as_ptr() as *const SegmentDescriptor, + }; + + // Load the new GDT, which is correctly located in thread local storage. + unsafe { dtables::lgdt(&gdtr) }; + + #[cfg(target_arch = "x86")] + unsafe { + // Reload the segment descriptors + segmentation::load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0)); + segmentation::load_ds(SegmentSelector::new(GDT_USER_DATA as u16, Ring::Ring3)); + segmentation::load_es(SegmentSelector::new(GDT_USER_DATA as u16, Ring::Ring3)); + segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + + // TODO: Use FS for kernel percpu on i686? 
+ segmentation::load_fs(SegmentSelector::new(GDT_USER_FS as u16, Ring::Ring0)); + segmentation::load_gs(SegmentSelector::new(GDT_KERNEL_PERCPU as u16, Ring::Ring0)); + } + + #[cfg(target_arch = "x86_64")] + unsafe { + // Load segments again, possibly resetting FSBASE and GSBASE. + segmentation::load_cs(SegmentSelector::new(GDT_KERNEL_CODE as u16, Ring::Ring0)); + segmentation::load_ss(SegmentSelector::new(GDT_KERNEL_DATA as u16, Ring::Ring0)); + + segmentation::load_ds(SegmentSelector::from_raw(0)); + segmentation::load_es(SegmentSelector::from_raw(0)); + segmentation::load_fs(SegmentSelector::from_raw(0)); + + // What happens when GS is loaded with a NULL selector, is undefined on Intel CPUs. However, + // GSBASE is set later. + segmentation::load_gs(SegmentSelector::from_raw(0)); + + // Ensure that GSBASE always points to the PCR in kernel space. + x86::msr::wrmsr(x86::msr::IA32_GS_BASE, pcr as *mut _ as usize as u64); + + // While GSBASE points to the PCR in kernel space, userspace is free to set it to other values. + // Zero-initialize userspace's GSBASE. The reason the GSBASE register writes are reversed, is + // because entering usermode will entail executing the SWAPGS instruction. + x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0); + + // Set the userspace FSBASE to zero. + x86::msr::wrmsr(x86::msr::IA32_FS_BASE, 0); + } + + // Load the task register + unsafe { task::load_tr(SegmentSelector::new(GDT_TSS as u16, Ring::Ring0)) }; + + unsafe { crate::percpu::init_tlb_shootdown(pcr.percpu.cpu_id, &mut pcr.percpu) }; +} + +/// Initialize GDT and configure percpu for the BSP. 
+#[cold]
+pub unsafe fn init_bsp(stack_end: usize) {
+    static mut BSP_PCR: ProcessorControlRegion =
+        ProcessorControlRegion::new_partial_init(LogicalCpuId::BSP);
+
+    init_pcr(unsafe { &mut *ptr::addr_of_mut!(BSP_PCR) }, stack_end);
+
+    unsafe { install_pcr(ptr::addr_of_mut!(BSP_PCR)) };
+}
+
+#[cold]
+pub fn allocate_and_init_pcr(
+    cpu_id: LogicalCpuId,
+    stack_end: usize,
+) -> *mut ProcessorControlRegion {
+    let alloc_order = size_of::<ProcessorControlRegion>()
+        .div_ceil(PAGE_SIZE) // pages needed for one PCR
+        .next_power_of_two()
+        .trailing_zeros(); // log2 of the rounded-up page count
+
+    let pcr_frame = crate::memory::allocate_p2frame(alloc_order).expect("failed to allocate PCR");
+    let pcr_ptr = RmmA::phys_to_virt(pcr_frame.base()).data() as *mut ProcessorControlRegion;
+    unsafe { core::ptr::write(pcr_ptr, ProcessorControlRegion::new_partial_init(cpu_id)) };
+
+    init_pcr(unsafe { &mut *pcr_ptr }, stack_end);
+
+    pcr_ptr // not installed yet; the caller is expected to pass it to `install_pcr`
+}
+
+#[derive(Copy, Clone, Debug)]
+#[repr(C, packed)]
+pub struct GdtEntry {
+    pub limitl: u16, // limit bits 15:0
+    pub offsetl: u16, // offset bits 15:0
+    pub offsetm: u8, // offset bits 23:16
+    pub access: u8,
+    pub flags_limith: u8, // high nibble: flags, low nibble: limit bits 19:16
+    pub offseth: u8, // offset bits 31:24
+}
+
+impl GdtEntry {
+    pub const fn new(offset: u32, limit: u32, access: u8, flags: u8) -> Self {
+        GdtEntry {
+            limitl: limit as u16,
+            offsetl: offset as u16,
+            offsetm: (offset >> 16) as u8,
+            access,
+            flags_limith: (flags & 0xF0) | (((limit >> 16) as u8) & 0x0F),
+            offseth: (offset >> 24) as u8,
+        }
+    }
+
+    #[cfg(target_arch = "x86")]
+    pub const fn offset(&self) -> u32 {
+        (self.offsetl as u32) | ((self.offsetm as u32) << 16) | ((self.offseth as u32) << 24)
+    }
+
+    pub const fn set_offset(&mut self, offset: u32) {
+        self.offsetl = offset as u16;
+        self.offsetm = (offset >> 16) as u8;
+        self.offseth = (offset >> 24) as u8;
+    }
+
+    pub const fn set_limit(&mut self, limit: u32) {
+        self.limitl = limit as u16;
+        self.flags_limith = (self.flags_limith & 0xF0) | (((limit >> 16) as u8) & 0x0F);
+    }
+}
+
+impl PercpuBlock {
+    pub fn current() -> &'static Self {
+        unsafe { &*core::ptr::addr_of!((*pcr()).percpu) }
+    }
+}
diff --git 
a/src/arch/x86_shared/idt.rs b/src/arch/x86_shared/idt.rs
new file mode 100644
index 0000000000..500645855d
--- /dev/null
+++ b/src/arch/x86_shared/idt.rs
@@ -0,0 +1,376 @@
+use core::{
+    cell::SyncUnsafeCell,
+    mem,
+    sync::atomic::{AtomicU32, Ordering},
+};
+
+use alloc::boxed::Box;
+use hashbrown::{hash_map::DefaultHashBuilder, HashMap};
+
+use x86::{
+    dtables::{self, DescriptorTablePointer},
+    segmentation::Descriptor as X86IdtEntry,
+};
+
+use crate::{
+    arch::{
+        interrupt::{
+            irq::{__generic_interrupts_end, __generic_interrupts_start},
+            *,
+        },
+        ipi::IpiKind,
+    },
+    cpu_set::LogicalCpuId,
+    memory::PAGE_SIZE,
+};
+
+use spin::RwLock;
+
+/// Interrupt descriptor table of one CPU, plus the bookkeeping that belongs to it.
+#[repr(C)]
+pub struct Idt {
+    pub(crate) entries: [IdtEntry; 256],
+    /// One bit per vector (8 * 32 = 256 bits); a set bit marks the vector as reserved.
+    reservations: [AtomicU32; 8],
+    /// End address of the backup interrupt stack; stored by `init_generic`.
+    backup_stack_end: usize,
+}
+
+impl Idt {
+    /// Creates a table with all-zero entries and no reserved vectors.
+    const fn new() -> Self {
+        Self {
+            entries: [IdtEntry::new(); 256],
+            reservations: [const { AtomicU32::new(0) }; 8],
+            backup_stack_end: 0,
+        }
+    }
+
+    /// Returns `true` if vector `index` is currently marked as reserved.
+    #[inline]
+    fn is_reserved(&self, index: u8) -> bool {
+        let word_index = index / 32;
+        let bit = index % 32;
+
+        self.reservations[usize::from(word_index)].load(Ordering::Acquire) & (1 << bit) != 0
+    }
+
+    /// Marks vector `index` as reserved (`true`) or as available again (`false`).
+    #[inline]
+    pub(crate) fn set_reserved(&self, index: u8, reserved: bool) {
+        let word = &self.reservations[usize::from(index / 32)];
+        let mask = 1_u32 << (index % 32);
+
+        if reserved {
+            word.fetch_or(mask, Ordering::AcqRel);
+        } else {
+            // `fetch_or` with a zero operand was a no-op, so a vector could never be released.
+            word.fetch_and(!mask, Ordering::AcqRel);
+        }
+    }
+
+    /// Like [`Idt::set_reserved`], but for callers that already have exclusive access.
+    #[inline]
+    pub(crate) fn set_reserved_mut(&mut self, index: u8, reserved: bool) {
+        let word = self.reservations[usize::from(index / 32)].get_mut();
+        let mask = 1_u32 << (index % 32);
+
+        if reserved {
+            *word |= mask;
+        } else {
+            *word &= !mask;
+        }
+    }
+}
+
+// Allocate 64 KiB of stack space for the backup stack.
+const BACKUP_STACK_SIZE: usize = PAGE_SIZE << 4;
+
+// Statically allocated IDT; `is_reserved`/`set_reserved` use it for `LogicalCpuId::BSP`.
+static INIT_BSP_IDT: SyncUnsafeCell<Idt> = SyncUnsafeCell::new(Idt::new());
+
+// TODO: VecMap? 
+pub(crate) static IDTS: RwLock> = + RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); + +#[inline] +pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool { + if cpu_id == LogicalCpuId::BSP { + return unsafe { (*INIT_BSP_IDT.get()).is_reserved(index) }; + } + + IDTS.read().get(&cpu_id).unwrap().is_reserved(index) +} + +#[inline] +pub fn set_reserved(cpu_id: LogicalCpuId, index: u8, reserved: bool) { + if cpu_id == LogicalCpuId::BSP { + unsafe { (*INIT_BSP_IDT.get()).set_reserved(index, reserved) }; + return; + } + + IDTS.read() + .get(&cpu_id) + .unwrap() + .set_reserved(index, reserved); +} + +pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator + 'static { + (32..=254).filter(move |&index| !is_reserved(cpu_id, index)) +} + +fn set_exceptions(idt: &mut [IdtEntry]) { + // Set up exceptions + idt[0].set_func(exception::divide_by_zero); + idt[1].set_func(exception::debug); + idt[2].set_func(exception::non_maskable); + idt[3].set_func(exception::breakpoint); + idt[3].set_flags(IdtFlags::PRESENT | IdtFlags::RING_3 | IdtFlags::INTERRUPT); + idt[4].set_func(exception::overflow); + idt[5].set_func(exception::bound_range); + idt[6].set_func(exception::invalid_opcode); + idt[7].set_func(exception::device_not_available); + idt[8].set_func(exception::double_fault); + // 9 no longer available + idt[10].set_func(exception::invalid_tss); + idt[11].set_func(exception::segment_not_present); + idt[12].set_func(exception::stack_segment); + idt[13].set_func(exception::protection); + idt[14].set_func(exception::page); + // 15 reserved + idt[16].set_func(exception::fpu_fault); + idt[17].set_func(exception::alignment_check); + idt[18].set_func(exception::machine_check); + idt[19].set_func(exception::simd); + idt[20].set_func(exception::virtualization); + // 21 through 29 reserved + idt[30].set_func(exception::security); + // 31 reserved +} + +/// Initializes a fully functional IDT for use before it be moved into the map. 
This is ONLY called +/// on the BSP, since the kernel heap is ready for the APs. +pub unsafe fn init_bsp() { + #[repr(C, packed(4096))] + struct BackupStack([u8; BACKUP_STACK_SIZE]); + + static INIT_BSP_BACKUP_STACK: SyncUnsafeCell = + SyncUnsafeCell::new(BackupStack([0; BACKUP_STACK_SIZE])); + + unsafe { + init_generic( + LogicalCpuId::BSP, + &mut *INIT_BSP_IDT.get(), + INIT_BSP_BACKUP_STACK.get().addr() + BACKUP_STACK_SIZE, + ); + + install_idt(&mut *INIT_BSP_IDT.get()); + } +} + +pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt { + let mut idts_btree = IDTS.write(); + + let idt = idts_btree + .entry(cpu_id) + .or_insert_with(|| Box::leak(Box::new(Idt::new()))); + + use crate::memory::{RmmA, RmmArch}; + let frames = crate::memory::allocate_p2frame(4) + .expect("failed to allocate pages for backup interrupt stack"); + + // Physical pages are mapped linearly. So is the linearly mapped virtual memory. + let base_address = RmmA::phys_to_virt(frames.base()); + + // Stack always grows downwards. + let backup_stack_end = base_address.data() + BACKUP_STACK_SIZE; + + init_generic(cpu_id, idt, backup_stack_end); + + *idt +} + +const BACKUP_IST: u8 = 1; + +/// Initializes an IDT for any type of processor. +fn init_generic(cpu_id: LogicalCpuId, idt: &mut Idt, backup_stack_end: usize) { + let (current_idt, current_reservations) = (&mut idt.entries, &mut idt.reservations); + + set_exceptions(current_idt); + + // We give Non-Maskable Interrupts, Double Fault, and Machine Check exceptions separate + // stacks, since these (unless we are going to set up NMI watchdogs like Linux does) are + // considered the most fatal, especially Double Faults which are caused by errors __when + // accessing the system IDT__. If that goes wrong, then kernel memory may be partially + // corrupt, and we want a separate stack. + // + // Note that each CPU has its own "backup interrupt stack". 
+ idt.backup_stack_end = backup_stack_end; + current_idt[2].set_ist(BACKUP_IST); + current_idt[8].set_ist(BACKUP_IST); + current_idt[18].set_ist(BACKUP_IST); + + assert_eq!( + __generic_interrupts_end as usize - __generic_interrupts_start as usize, + 224 * 8 + ); + + for i in 0..224 { + current_idt[i + 32].set_func(unsafe { + mem::transmute::( + __generic_interrupts_start as usize + i * 8, + ) + }); + } + + // reserve bits 31:0, i.e. the first 32 interrupts, which are reserved for exceptions + *current_reservations[0].get_mut() |= 0x0000_0000_FFFF_FFFF; + + if cpu_id == LogicalCpuId::BSP { + // Set up IRQs + current_idt[32].set_func(irq::pit_stack); + current_idt[33].set_func(irq::keyboard); + current_idt[34].set_func(irq::cascade); + current_idt[35].set_func(irq::com2); + current_idt[36].set_func(irq::com1); + current_idt[37].set_func(irq::lpt2); + current_idt[38].set_func(irq::floppy); + current_idt[39].set_func(irq::lpt1); + current_idt[40].set_func(irq::rtc); + current_idt[41].set_func(irq::pci1); + current_idt[42].set_func(irq::pci2); + current_idt[43].set_func(irq::pci3); + current_idt[44].set_func(irq::mouse); + current_idt[45].set_func(irq::fpu); + current_idt[46].set_func(irq::ata1); + current_idt[47].set_func(irq::ata2); + current_idt[48].set_func(irq::lapic_timer); + current_idt[49].set_func(irq::lapic_error); + + // reserve bits 49:32, which are for the standard IRQs, and for the local apic timer and error. + *current_reservations[1].get_mut() |= 0x0003_FFFF; + } else { + // TODO: use_default_irqs! 
but also the legacy IRQs that are only needed on one CPU + current_idt[49].set_func(irq::lapic_error); + + // reserve bit 49 + *current_reservations[1].get_mut() |= 1 << 17; + } + + // Set IPI handlers + current_idt[IpiKind::Wakeup as usize].set_func(ipi::wakeup); + current_idt[IpiKind::Switch as usize].set_func(ipi::switch); + current_idt[IpiKind::Tlb as usize].set_func(ipi::tlb); + current_idt[IpiKind::Pit as usize].set_func(ipi::pit); + idt.set_reserved_mut(IpiKind::Wakeup as u8, true); + idt.set_reserved_mut(IpiKind::Switch as u8, true); + idt.set_reserved_mut(IpiKind::Tlb as u8, true); + idt.set_reserved_mut(IpiKind::Pit as u8, true); + + #[cfg(target_arch = "x86")] + { + let current_idt = &mut idt.entries; + // Set syscall function + current_idt[0x80].set_func(syscall::syscall); + current_idt[0x80].set_flags(IdtFlags::PRESENT | IdtFlags::RING_3 | IdtFlags::INTERRUPT); + idt.set_reserved_mut(0x80, true); + } + + #[cfg(feature = "profiling")] + crate::profiling::maybe_setup_timer(idt, cpu_id); +} + +pub unsafe fn install_idt(idt_ptr: *mut Idt) { + unsafe { + let idt = &mut *idt_ptr; + + #[cfg(target_arch = "x86_64")] // TODO: x86 + { + (*crate::arch::gdt::pcr()).tss.ist[usize::from(BACKUP_IST - 1)] = + idt.backup_stack_end as u64; + } + + let idtr: DescriptorTablePointer = DescriptorTablePointer { + limit: (idt.entries.len() * size_of::() - 1) as u16, + base: idt.entries.as_ptr() as *const X86IdtEntry, + }; + + dtables::lidt(&idtr); + } +} + +bitflags! 
{ + pub struct IdtFlags: u8 { + const PRESENT = 1 << 7; + const RING_0 = 0 << 5; + const RING_1 = 1 << 5; + const RING_2 = 2 << 5; + const RING_3 = 3 << 5; + const SS = 1 << 4; + const INTERRUPT = 0xE; + const TRAP = 0xF; + } +} + +#[derive(Copy, Clone, Debug, Default)] +#[repr(C, packed)] +pub struct IdtEntry { + offsetl: u16, + selector: u16, + zero: u8, + attribute: u8, + offsetm: u16, + #[cfg(target_arch = "x86_64")] + offseth: u32, + #[cfg(target_arch = "x86_64")] + _zero2: u32, +} + +impl IdtEntry { + pub const fn new() -> IdtEntry { + IdtEntry { + offsetl: 0, + selector: 0, + zero: 0, + attribute: 0, + offsetm: 0, + #[cfg(target_arch = "x86_64")] + offseth: 0, + #[cfg(target_arch = "x86_64")] + _zero2: 0, + } + } + + pub fn set_flags(&mut self, flags: IdtFlags) { + self.attribute = flags.bits(); + } + + pub fn set_ist(&mut self, ist: u8) { + assert_eq!( + ist & 0x07, + ist, + "interrupt stack table must be within 0..=7" + ); + self.zero &= 0xF8; + self.zero |= ist; + } + + pub fn set_offset(&mut self, selector: u16, base: usize) { + self.selector = selector; + self.offsetl = base as u16; + self.offsetm = (base >> 16) as u16; + #[cfg(target_arch = "x86_64")] + { + self.offseth = ((base as u64) >> 32) as u32; + } + } + + // A function to set the offset more easily + pub fn set_func(&mut self, func: unsafe extern "C" fn()) { + self.set_flags(IdtFlags::PRESENT | IdtFlags::RING_0 | IdtFlags::INTERRUPT); + self.set_offset( + (crate::arch::gdt::GDT_KERNEL_CODE as u16) << 3, + func as usize, + ); + } +} diff --git a/src/arch/x86_shared/interrupt/exception.rs b/src/arch/x86_shared/interrupt/exception.rs new file mode 100644 index 0000000000..7725a45d0a --- /dev/null +++ b/src/arch/x86_shared/interrupt/exception.rs @@ -0,0 +1,294 @@ +use syscall::Exception; +use x86::irq::PageFaultError; + +use crate::{ + arch::x86_shared::interrupt, + context::signal::excp_handler, + memory::{GenericPfFlags, VirtualAddress}, + ptrace, + sync::CleanLockToken, + syscall::flag::*, +}; + 
+interrupt_stack!(divide_by_zero, |stack| { + println!("Divide by zero"); + stack.trace(); + excp_handler(Exception { + kind: 0, + ..Default::default() + }); +}); + +interrupt_stack!(debug, @paranoid, |stack| { + let mut handled = false; + + // Disable singlestep before there is a breakpoint, since the breakpoint + // handler might end up setting it again but unless it does we want the + // default to be false. + #[cfg(target_arch = "x86")] + let had_singlestep = stack.iret.eflags & (1 << 8) == 1 << 8; + #[cfg(target_arch = "x86_64")] + let had_singlestep = stack.iret.rflags & (1 << 8) == 1 << 8; + stack.set_singlestep(false); + + let mut token = unsafe { CleanLockToken::new() }; + if ptrace::breakpoint_callback(PTRACE_STOP_SINGLESTEP, None, &mut token).is_some() { + handled = true; + } else { + // There was no breakpoint, restore original value + stack.set_singlestep(had_singlestep); + } + + if !handled { + println!("Debug trap"); + stack.dump(); + excp_handler(Exception { + kind: 1, + ..Default::default() + }); + } +}); + +interrupt_stack!(non_maskable, @paranoid, |stack| { + #[cfg(target_arch = "x86_64")] + unsafe { crate::profiling::nmi_handler(stack) }; + + #[cfg(not(all(target_arch = "x86_64", feature = "profiling")))] + { + // TODO: This will likely deadlock + println!("Non-maskable interrupt"); + stack.dump(); + } +}); + +interrupt_stack!(breakpoint, |stack| { + // The processor lets EIP/RIP point to the instruction *after* int3, so + // unhandled breakpoint interrupt don't go in an infinite loop. But we + // throw SIGTRAP anyway, so that's not a problem. + // + // We have the following code to prevent + // - RIP from going out of sync with instructions + // - The user having to do 2 syscalls to replace the instruction at RIP + // - Having more compatibility glue for GDB than necessary + // + // Let's just follow Linux convention and let RIP be RIP-1, point to the + // int3 instruction. After all, it's the sanest thing to do. 
+ #[cfg(target_arch = "x86")] + { + stack.iret.eip -= 1; + } + #[cfg(target_arch = "x86_64")] + { + stack.iret.rip -= 1; + } + + let mut token = unsafe { CleanLockToken::new() }; + if ptrace::breakpoint_callback(PTRACE_STOP_BREAKPOINT, None, &mut token).is_none() { + println!("Breakpoint trap"); + stack.dump(); + excp_handler(Exception { + kind: 3, + ..Default::default() + }); + } +}); + +interrupt_stack!(overflow, |stack| { + println!("Overflow trap"); + stack.trace(); + excp_handler(Exception { + kind: 4, + ..Default::default() + }); +}); + +interrupt_stack!(bound_range, |stack| { + println!("Bound range exceeded fault"); + stack.trace(); + excp_handler(Exception { + kind: 5, + ..Default::default() + }); +}); + +interrupt_stack!(invalid_opcode, |stack| { + println!("Invalid opcode fault"); + stack.trace(); + excp_handler(Exception { + kind: 6, + ..Default::default() + }); +}); + +interrupt_stack!(device_not_available, |stack| { + println!("Device not available fault"); + stack.trace(); + excp_handler(Exception { + kind: 7, + ..Default::default() + }); +}); + +interrupt_error!(double_fault, |stack, _code| { + println!("Double fault"); + stack.trace(); + unsafe { + loop { + interrupt::disable(); + interrupt::halt(); + } + } +}); + +interrupt_error!(invalid_tss, |stack, code| { + println!("Invalid TSS fault"); + stack.trace(); + excp_handler(Exception { + kind: 10, + code, + ..Default::default() + }); +}); + +interrupt_error!(segment_not_present, |stack, code| { + println!("Segment not present fault"); + stack.trace(); + excp_handler(Exception { + kind: 11, + code, + ..Default::default() + }); +}); + +interrupt_error!(stack_segment, |stack, code| { + println!("Stack segment fault"); + stack.trace(); + excp_handler(Exception { + kind: 12, + code, + ..Default::default() + }); +}); + +interrupt_error!(protection, |stack, code| { + println!("Protection fault code={:#0x}", code); + stack.trace(); + excp_handler(Exception { + kind: 13, + code, + ..Default::default() + }); 
+}); + +interrupt_error!(page, |stack, code| { + let cr2 = VirtualAddress::new(unsafe { x86::controlregs::cr2() }); + let arch_flags = PageFaultError::from_bits_truncate(code as u32); + let mut generic_flags = GenericPfFlags::empty(); + + generic_flags.set( + GenericPfFlags::PRESENT, + arch_flags.contains(PageFaultError::P), + ); + generic_flags.set( + GenericPfFlags::INVOLVED_WRITE, + arch_flags.contains(PageFaultError::WR), + ); + generic_flags.set( + GenericPfFlags::USER_NOT_SUPERVISOR, + arch_flags.contains(PageFaultError::US), + ); + generic_flags.set( + GenericPfFlags::INVL, + arch_flags.contains(PageFaultError::RSVD), + ); + generic_flags.set( + GenericPfFlags::INSTR_NOT_DATA, + arch_flags.contains(PageFaultError::ID), + ); + + #[cfg(target_arch = "x86")] + if crate::memory::page_fault_handler(&mut stack.inner, generic_flags, cr2).is_err() { + println!("Page fault: {:>08X} {:#?}", cr2.data(), arch_flags); + stack.trace(); + excp_handler(Exception { + kind: 14, + code, + address: cr2.data(), + }); + } + + #[cfg(target_arch = "x86_64")] + if crate::memory::page_fault_handler(stack, generic_flags, cr2).is_err() { + println!("Page fault: {:>016X} {:#?}", cr2.data(), arch_flags); + stack.trace(); + excp_handler(Exception { + kind: 14, + code, + address: cr2.data(), + }); + } +}); + +interrupt_stack!(fpu_fault, |stack| { + println!("FPU floating point fault"); + stack.trace(); + excp_handler(Exception { + kind: 16, + ..Default::default() + }); +}); + +interrupt_error!(alignment_check, |stack, code| { + println!("Alignment check fault"); + stack.trace(); + excp_handler(Exception { + kind: 17, + code, + ..Default::default() + }); +}); + +interrupt_stack!(machine_check, @paranoid, |stack| { + println!("Machine check fault"); + stack.trace(); + unsafe { + loop { + interrupt::disable(); + interrupt::halt(); + } + } +}); + +interrupt_stack!(simd, |stack| { + println!("SIMD floating point fault"); + let mut mxcsr = 0_usize; + unsafe { core::arch::asm!("stmxcsr [{}]", 
in(reg) core::ptr::addr_of_mut!(mxcsr)) }; + println!("MXCSR {:#0x}", mxcsr); + stack.trace(); + excp_handler(Exception { + kind: 19, + ..Default::default() + }); +}); + +interrupt_stack!(virtualization, |stack| { + println!("Virtualization fault"); + stack.trace(); + unsafe { + loop { + interrupt::disable(); + interrupt::halt(); + } + } +}); + +interrupt_error!(security, |stack, _code| { + println!("Security exception"); + stack.trace(); + unsafe { + loop { + interrupt::disable(); + interrupt::halt(); + } + } +}); diff --git a/src/arch/x86_shared/interrupt/ipi.rs b/src/arch/x86_shared/interrupt/ipi.rs new file mode 100644 index 0000000000..20e9d436a1 --- /dev/null +++ b/src/arch/x86_shared/interrupt/ipi.rs @@ -0,0 +1,28 @@ +use crate::{ + arch::device::local_apic::the_local_apic, context, percpu::PercpuBlock, sync::CleanLockToken, +}; + +interrupt!(wakeup, || { + unsafe { the_local_apic().eoi() }; +}); + +interrupt!(tlb, || { + PercpuBlock::current().maybe_handle_tlb_shootdown(); + + unsafe { the_local_apic().eoi() }; +}); + +interrupt!(switch, || { + unsafe { the_local_apic().eoi() }; + + let mut token = unsafe { CleanLockToken::new() }; + let _ = context::switch(&mut token); +}); + +interrupt!(pit, || { + unsafe { the_local_apic().eoi() }; + + // Switch after a sufficient amount of time since the last switch. 
+ let mut token = unsafe { CleanLockToken::new() }; + context::switch::tick(&mut token); +}); diff --git a/src/arch/x86_shared/interrupt/irq.rs b/src/arch/x86_shared/interrupt/irq.rs new file mode 100644 index 0000000000..a0b376bc0d --- /dev/null +++ b/src/arch/x86_shared/interrupt/irq.rs @@ -0,0 +1,352 @@ +use core::sync::atomic::{AtomicUsize, Ordering}; + +use alloc::vec::Vec; + +use crate::{ + arch::{ + device::{ + ioapic, local_apic, pic, pit, + serial::{COM1, COM2}, + }, + ipi::{ipi, IpiKind, IpiTarget}, + }, + context::{self, timeout}, + percpu::PercpuBlock, + scheme::{irq::irq_trigger, serio::serio_input}, + sync::CleanLockToken, + time, +}; + +#[repr(u8)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum IrqMethod { + Pic = 0, + Apic = 1, +} + +static SPURIOUS_COUNT_IRQ7: AtomicUsize = AtomicUsize::new(0); +static SPURIOUS_COUNT_IRQ15: AtomicUsize = AtomicUsize::new(0); + +pub fn spurious_count_irq7() -> usize { + SPURIOUS_COUNT_IRQ7.load(Ordering::Relaxed) +} +pub fn spurious_count_irq15() -> usize { + SPURIOUS_COUNT_IRQ15.load(Ordering::Relaxed) +} +pub fn spurious_count() -> usize { + spurious_count_irq7() + spurious_count_irq15() +} +pub fn spurious_irq_resource(_token: &mut CleanLockToken) -> syscall::Result> { + match irq_method() { + IrqMethod::Apic => Ok(Vec::from(&b"(not implemented for APIC yet)"[..])), + IrqMethod::Pic => Ok(format!( + "{}\tIRQ7\n{}\tIRQ15\n{}\ttotal\n", + spurious_count_irq7(), + spurious_count_irq15(), + spurious_count() + ) + .into_bytes()), + } +} + +static IRQ_METHOD: AtomicUsize = AtomicUsize::new(IrqMethod::Pic as usize); + +pub fn set_irq_method(method: IrqMethod) { + IRQ_METHOD.store(method as usize, core::sync::atomic::Ordering::Release); +} + +fn irq_method() -> IrqMethod { + let raw = IRQ_METHOD.load(core::sync::atomic::Ordering::Acquire); + + match raw { + 0 => IrqMethod::Pic, + 1 => IrqMethod::Apic, + _ => unreachable!(), + } +} + +/// Notify the IRQ scheme that an IRQ has been registered. 
This should mask the IRQ until the +/// scheme user unmasks it ("acknowledges" it). +unsafe fn trigger(irq: u8) { + unsafe { + match irq_method() { + IrqMethod::Pic => { + if irq < 16 { + pic_mask(irq) + } + } + IrqMethod::Apic => ioapic_mask(irq), + } + let mut token = CleanLockToken::new(); + irq_trigger(irq, &mut token); + } +} + +/// Unmask the IRQ. This is called from the IRQ scheme, which does this when a user process has +/// processed the IRQ. +pub unsafe fn acknowledge(irq: usize) { + unsafe { + match irq_method() { + IrqMethod::Pic => { + if irq < 16 { + pic_unmask(irq) + } + } + IrqMethod::Apic => ioapic_unmask(irq), + } + } +} + +/// Sends an end-of-interrupt, so that the interrupt controller can go on to the next one. +pub unsafe fn eoi(irq: u8) { + unsafe { + PercpuBlock::current().stats.add_irq(irq); + + match irq_method() { + IrqMethod::Pic => { + if irq < 16 { + pic_eoi(irq) + } + } + IrqMethod::Apic => lapic_eoi(), + } + } +} + +unsafe fn pic_mask(irq: u8) { + unsafe { + debug_assert!(irq < 16); + + if irq >= 8 { + pic::slave().mask_set(irq - 8); + } else { + pic::master().mask_set(irq); + } + } +} + +unsafe fn ioapic_mask(irq: u8) { + unsafe { + ioapic::mask(irq); + } +} + +unsafe fn pic_eoi(irq: u8) { + unsafe { + debug_assert!(irq < 16); + + if irq >= 8 { + pic::master().ack(); + pic::slave().ack(); + } else { + pic::master().ack(); + } + } +} + +unsafe fn lapic_eoi() { + unsafe { local_apic::the_local_apic().eoi() } +} + +unsafe fn pic_unmask(irq: usize) { + unsafe { + debug_assert!(irq < 16); + + if irq >= 8 { + pic::slave().mask_clear(irq as u8 - 8); + } else { + pic::master().mask_clear(irq as u8); + } + } +} + +unsafe fn ioapic_unmask(irq: usize) { + unsafe { + ioapic::unmask(irq as u8); + } +} + +interrupt_stack!(pit_stack, |_stack| { + // Saves CPU time by not sending IRQ event irq_trigger(0); + + let mut token = unsafe { CleanLockToken::new() }; + { + *time::OFFSET.write(token.token()) += pit::RATE; + } + + unsafe { eoi(0) }; + + // 
Wake up other CPUs + ipi(IpiKind::Pit, IpiTarget::Other); + + // Any better way of doing this? + timeout::trigger(&mut token); + + // Switch after a sufficient amount of time since the last switch. + context::switch::tick(&mut token); +}); + +interrupt!(keyboard, || { + let data: u8; + unsafe { core::arch::asm!("in al, 0x60", out("al") data) }; + + unsafe { eoi(1) }; + + let mut token = unsafe { CleanLockToken::new() }; + serio_input(0, data, &mut token); +}); + +interrupt!(cascade, || { + // No need to do any operations on cascade + unsafe { eoi(2) }; +}); + +interrupt!(com2, || { + let mut token = unsafe { CleanLockToken::new() }; + COM2.lock().receive(&mut token); + unsafe { eoi(3) }; +}); + +interrupt!(com1, || { + let mut token = unsafe { CleanLockToken::new() }; + COM1.lock().receive(&mut token); + unsafe { eoi(4) }; +}); + +interrupt!(lpt2, || { + unsafe { + trigger(5); + eoi(5); + } +}); + +interrupt!(floppy, || { + unsafe { + trigger(6); + eoi(6); + } +}); + +interrupt!(lpt1, || { + unsafe { + if irq_method() == IrqMethod::Pic && pic::master().isr() & (1 << 7) == 0 { + // the IRQ was spurious, ignore it but increment a counter. 
+ SPURIOUS_COUNT_IRQ7.fetch_add(1, Ordering::Relaxed); + return; + } + trigger(7); + eoi(7); + } +}); + +interrupt!(rtc, || { + unsafe { + trigger(8); + eoi(8); + } +}); + +interrupt!(pci1, || { + unsafe { + trigger(9); + eoi(9); + } +}); + +interrupt!(pci2, || { + unsafe { + trigger(10); + eoi(10); + } +}); + +interrupt!(pci3, || { + unsafe { + trigger(11); + eoi(11); + } +}); + +interrupt!(mouse, || { + let data: u8; + unsafe { core::arch::asm!("in al, 0x60", out("al") data) }; + + unsafe { eoi(12) }; + + let mut token = unsafe { CleanLockToken::new() }; + serio_input(1, data, &mut token); +}); + +interrupt!(fpu, || { + unsafe { + trigger(13); + eoi(13); + } +}); + +interrupt!(ata1, || { + unsafe { + trigger(14); + eoi(14); + } +}); + +interrupt!(ata2, || { + unsafe { + if irq_method() == IrqMethod::Pic && pic::slave().isr() & (1 << 7) == 0 { + SPURIOUS_COUNT_IRQ15.fetch_add(1, Ordering::Relaxed); + pic::master().ack(); + return; + } + trigger(15); + eoi(15); + } +}); + +interrupt!(lapic_timer, || { + println!("Local apic timer interrupt"); + unsafe { lapic_eoi() }; +}); +#[cfg(feature = "profiling")] +interrupt!(aux_timer, || { + unsafe { lapic_eoi() }; + crate::ipi::ipi(IpiKind::Profile, IpiTarget::Other); +}); + +interrupt!(lapic_error, || { + error!("Local apic internal error: ESR={:#0x}", unsafe { + local_apic::the_local_apic().esr() + }); + unsafe { lapic_eoi() }; +}); + +interrupt_error!(generic_irq, |_stack, code| { + let mut token = unsafe { CleanLockToken::new() }; + + // The reason why 128 is subtracted and added from the code, is that PUSH imm8 sign-extends the + // value, and the longer PUSH imm32 would make the generic_interrupts table twice as large + // (containing lots of useless NOPs). 
+ irq_trigger((code as i32).wrapping_add(128) as u8, &mut token); + + unsafe { lapic_eoi() }; +}); + +core::arch::global_asm!(" + .globl __generic_interrupts_start + .globl __generic_interrupts_end + .p2align 3 +__generic_interrupts_start: + n = 0 + .rept 224 + push (n - 128) + jmp {} + .p2align 3 + n = n + 1 + .endr +__generic_interrupts_end: +", sym generic_irq); + +unsafe extern "C" { + pub fn __generic_interrupts_start(); + pub fn __generic_interrupts_end(); +} diff --git a/src/arch/x86_shared/interrupt/mod.rs b/src/arch/x86_shared/interrupt/mod.rs new file mode 100644 index 0000000000..172bad3ba9 --- /dev/null +++ b/src/arch/x86_shared/interrupt/mod.rs @@ -0,0 +1,44 @@ +//! Interrupt instructions + +pub mod exception; +pub mod ipi; +pub mod irq; +pub mod trace; + +pub use super::idt::{available_irqs_iter, is_reserved, set_reserved}; + +/// Clear interrupts +#[inline(always)] +pub unsafe fn disable() { + unsafe { + core::arch::asm!("cli", options(nomem, nostack)); + } +} + +/// Set interrupts and halt +/// This will atomically wait for the next interrupt +/// Performing enable followed by halt is not guaranteed to be atomic, use this instead! +#[inline(always)] +pub unsafe fn enable_and_halt() { + unsafe { + core::arch::asm!("sti; hlt", options(nomem, nostack)); + } +} + +/// Set interrupts and nop +/// This will enable interrupts and allow the IF flag to be processed +/// Simply enabling interrupts does not guarantee that they will trigger, use this instead! 
+#[inline(always)] +pub unsafe fn enable_and_nop() { + unsafe { + core::arch::asm!("sti; nop", options(nomem, nostack)); + } +} + +/// Halt instruction +#[inline(always)] +pub unsafe fn halt() { + unsafe { + core::arch::asm!("hlt", options(nomem, nostack)); + } +} diff --git a/src/arch/x86_shared/interrupt/trace.rs b/src/arch/x86_shared/interrupt/trace.rs new file mode 100644 index 0000000000..5046b6f638 --- /dev/null +++ b/src/arch/x86_shared/interrupt/trace.rs @@ -0,0 +1,33 @@ +pub struct StackTrace { + pub fp: usize, + pub pc_ptr: *const usize, +} + +impl StackTrace { + #[inline(always)] + pub unsafe fn start() -> Option { + unsafe { + let mut fp: usize; + #[cfg(target_arch = "x86")] + core::arch::asm!("mov {}, ebp", out(reg) fp); + #[cfg(target_arch = "x86_64")] + core::arch::asm!("mov {}, rbp", out(reg) fp); + let pc_ptr = fp.checked_add(size_of::())?; + Some(Self { + fp, + pc_ptr: pc_ptr as *const usize, + }) + } + } + + pub unsafe fn next(self) -> Option { + unsafe { + let fp = *(self.fp as *const usize); + let pc_ptr = fp.checked_add(size_of::())?; + Some(Self { + fp, + pc_ptr: pc_ptr as *const usize, + }) + } + } +} diff --git a/src/arch/x86_shared/ipi.rs b/src/arch/x86_shared/ipi.rs new file mode 100644 index 0000000000..f38db763c4 --- /dev/null +++ b/src/arch/x86_shared/ipi.rs @@ -0,0 +1,53 @@ +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum IpiKind { + Wakeup = 0x40, + Tlb = 0x41, + Switch = 0x42, + Pit = 0x43, + + #[cfg(feature = "profiling")] + Profile = 0x44, +} + +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum IpiTarget { + Current = 1, + All = 2, + Other = 3, +} + +#[inline(always)] +pub fn ipi(kind: IpiKind, target: IpiTarget) { + use crate::arch::device::local_apic::the_local_apic; + + if cfg!(not(feature = "multi_core")) { + return; + } + + #[cfg(feature = "profiling")] + if matches!(kind, IpiKind::Profile) { + let icr = ((target as u64) << 18) | (1 << 14) | (0b100 << 8); + unsafe { the_local_apic().set_icr(icr) }; + return; + } + + 
let icr = ((target as u64) << 18) | (1 << 14) | (kind as u64); + unsafe { the_local_apic().set_icr(icr) }; +} + +#[inline(always)] +pub fn ipi_single(kind: IpiKind, target: &crate::percpu::PercpuBlock) { + use crate::arch::device::local_apic::the_local_apic; + + if cfg!(not(feature = "multi_core")) { + return; + } + + if let Some(apic_id) = target.misc_arch_info.apic_id_opt.get() { + unsafe { + the_local_apic().ipi(apic_id, kind); + } + } +} diff --git a/src/arch/x86_shared/mod.rs b/src/arch/x86_shared/mod.rs new file mode 100644 index 0000000000..e3c30501b8 --- /dev/null +++ b/src/arch/x86_shared/mod.rs @@ -0,0 +1,45 @@ +/// CPUID wrapper +pub mod cpuid; + +/// Debugging support +pub mod debug; + +/// Devices +pub mod device; + +/// Global descriptor table +pub mod gdt; + +/// Interrupt descriptor table +pub mod idt; + +/// Interrupt instructions +pub mod interrupt; + +/// Inter-processor interrupts +pub mod ipi; + +/// Paging +pub mod paging; + +/// Page table isolation +pub mod pti; + +/// Initialization and start function +pub mod start; + +/// Stop function +pub mod stop; + +pub mod time; + +#[cfg(target_arch = "x86")] +pub use ::rmm::x86::X86Arch as CurrentRmmArch; + +#[cfg(target_arch = "x86_64")] +pub use ::rmm::x86_64::X8664Arch as CurrentRmmArch; + +// Flags +pub mod flags { + pub const FLAG_SINGLESTEP: usize = 1 << 8; +} diff --git a/src/arch/x86_shared/paging.rs b/src/arch/x86_shared/paging.rs new file mode 100644 index 0000000000..cbfa2ee897 --- /dev/null +++ b/src/arch/x86_shared/paging.rs @@ -0,0 +1,10 @@ +/// Initialize PAT +#[cold] +pub unsafe fn init() { + unsafe { + #[cfg(target_arch = "x86")] + rmm::x86::init_pat(); + #[cfg(target_arch = "x86_64")] + rmm::x86_64::init_pat(); + } +} diff --git a/src/arch/x86_shared/pti.rs b/src/arch/x86_shared/pti.rs new file mode 100644 index 0000000000..0ecaf79854 --- /dev/null +++ b/src/arch/x86_shared/pti.rs @@ -0,0 +1,86 @@ +#[cfg(feature = "pti")] +use core::ptr; + +#[cfg(feature = "pti")] +use 
crate::memory::Frame; +#[cfg(feature = "pti")] +use crate::paging::ActivePageTable; + +#[cfg(feature = "pti")] +#[thread_local] +pub static mut PTI_CPU_STACK: [u8; 256] = [0; 256]; + +#[cfg(feature = "pti")] +#[thread_local] +pub static mut PTI_CONTEXT_STACK: usize = 0; + +#[cfg(feature = "pti")] +#[inline(always)] +unsafe fn switch_stack(old: usize, new: usize) { + let old_rsp: usize; + asm!("", out("rsp") old_rsp); + + let offset_rsp = old - old_rsp; + + let new_rsp = new - offset_rsp; + + ptr::copy_nonoverlapping(old_rsp as *const u8, new_rsp as *mut u8, offset_rsp); + + asm!("", out("rsp") new_rsp); +} + +#[cfg(feature = "pti")] +#[inline(always)] +pub unsafe fn map() { + // { + // let mut active_table = unsafe { ActivePageTable::new() }; + // + // // Map kernel heap + // let address = active_table.p4()[::KERNEL_HEAP_PML4].address(); + // let frame = Frame::containing(address); + // let mut flags = active_table.p4()[::KERNEL_HEAP_PML4].flags(); + // flags.remove(EntryFlags::PRESENT); + // active_table.p4_mut()[::KERNEL_HEAP_PML4].set(frame, flags); + // + // // Reload page tables + // active_table.flush_all(); + // } + + // Switch to per-context stack + switch_stack( + PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len(), + PTI_CONTEXT_STACK, + ); +} + +#[cfg(feature = "pti")] +#[inline(always)] +pub unsafe extern "C" fn unmap() { + // Switch to per-CPU stack + switch_stack( + PTI_CONTEXT_STACK, + PTI_CPU_STACK.as_ptr() as usize + PTI_CPU_STACK.len(), + ); + + // { + // let mut active_table = unsafe { ActivePageTable::new() }; + // + // // Unmap kernel heap + // let address = active_table.p4()[::KERNEL_HEAP_PML4].address(); + // let frame = Frame::containing(address); + // let mut flags = active_table.p4()[::KERNEL_HEAP_PML4].flags(); + // flags.insert(EntryFlags::PRESENT); + // active_table.p4_mut()[::KERNEL_HEAP_PML4].set(frame, flags); + // + // // Reload page tables + // active_table.flush_all(); + // } +} + +#[cfg(not(feature = "pti"))] +#[inline(always)] 
+pub unsafe fn map() {} + +#[cfg(not(feature = "pti"))] +#[inline(always)] +pub unsafe extern "C" fn unmap() {} diff --git a/src/arch/x86_shared/start.rs b/src/arch/x86_shared/start.rs new file mode 100644 index 0000000000..7a7c0ae815 --- /dev/null +++ b/src/arch/x86_shared/start.rs @@ -0,0 +1,221 @@ +//! This function is where the kernel sets up IRQ handlers +//! It is incredibly unsafe, and should be minimal in nature +//! It must create the IDT with the correct entries, those entries are +//! defined in other files inside of the `arch` module +use core::{arch::naked_asm, cell::SyncUnsafeCell, mem::offset_of}; + +use crate::{ + allocator, + arch::{device, gdt, idt, interrupt, paging}, + cpu_set::LogicalCpuId, + devices::graphical_debug, + startup::KernelArgs, +}; + +/// Test of zero values in BSS. +static BSS_TEST_ZERO: SyncUnsafeCell = SyncUnsafeCell::new(0); +/// Test of non-zero values in data. +static DATA_TEST_NONZERO: SyncUnsafeCell = SyncUnsafeCell::new(usize::MAX); + +#[repr(C, align(16))] +struct StackAlign(T); + +static STACK: SyncUnsafeCell> = + SyncUnsafeCell::new(StackAlign([0; 128 * 1024])); + +// FIXME use extern "custom" +#[unsafe(naked)] +#[unsafe(no_mangle)] +extern "C" fn kstart() { + naked_asm!( + #[cfg(target_arch = "x86")] + " + // BSS should already be zero + cmp dword ptr [{bss_test_zero}], 0 + jne .Lkstart_crash + cmp dword ptr [{data_test_nonzero}], 0 + je .Lkstart_crash + + mov eax, [esp + 4] + lea esp, [{stack}+{stack_size}-16] + mov [esp + 4], eax + mov [esp + 8], esp + + jmp {start} + + .Lkstart_crash: + xor eax, eax + jmp eax + ", + + #[cfg(target_arch = "x86_64")] + " + // BSS should already be zero + cmp qword ptr [rip + {bss_test_zero}], 0 + jne .Lkstart_crash + cmp qword ptr [rip + {data_test_nonzero}], 0 + je .Lkstart_crash + + // Note: The System V ABI requires the stack to be aligned to 16 bytes + // before the call instruction. As we jump rather than call it has to + // be offset by 8 bytes. 
Additionally reserve a bit more space at the + // end of the stack to ensure that the start function returns to + // address 0. + lea rsp, [rip + {stack}+{stack_size}-24] + mov rsi, rsp + + jmp {start} + + .Lkstart_crash: + xor rax, rax + jmp rax + ", + + bss_test_zero = sym BSS_TEST_ZERO, + data_test_nonzero = sym DATA_TEST_NONZERO, + stack = sym STACK, + stack_size = const size_of_val(&STACK), + start = sym start, + ); +} + +/// The entry to Rust, all things must be initialized +unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { + unsafe { + let bootstrap = { + let args = args_ptr.read(); + + // Set up serial debug + device::serial::init(); + + // Set up graphical debug + graphical_debug::init(args.env()); + + info!("Redox OS starting..."); + args.print(); + + // Set up GDT + gdt::init_bsp(stack_end); + + // Set up IDT + idt::init_bsp(); + + // Initialize RMM + #[cfg(target_arch = "x86")] + crate::startup::memory::init(&args, Some(0x100000), Some(0x40000000)); + #[cfg(target_arch = "x86_64")] + crate::startup::memory::init(&args, Some(0x100000), None); + + // Initialize paging + paging::init(); + + #[cfg(target_arch = "x86_64")] + crate::arch::alternative::early_init(true); + + // Set up syscall instruction + interrupt::syscall::init(); + + // Setup kernel heap + allocator::init(); + + // Activate memory logging + crate::log::init(); + + // Initialize miscellaneous processor features + #[cfg(target_arch = "x86_64")] + crate::arch::misc::init(LogicalCpuId::BSP); + + // Initialize devices + device::init(); + + // Read ACPI tables, starts APs + if cfg!(feature = "acpi") { + crate::acpi::init(args.acpi_rsdp()); + device::init_after_acpi(); + } + crate::profiling::init(); + + // Initialize all of the non-core devices not otherwise needed to complete initialization + device::init_noncore(); + + args.bootstrap() + }; + + crate::startup::kmain(bootstrap); + } +} + +pub struct KernelArgsAp { + pub stack_end: *mut u8, + pub cpu_id: 
LogicalCpuId, + pub pcr_ptr: *mut gdt::ProcessorControlRegion, + pub idt_ptr: *mut idt::Idt, +} + +// FIXME use extern "custom" +#[unsafe(naked)] +pub extern "C" fn kstart_ap() { + naked_asm!( + #[cfg(target_arch = "x86")] + " + mov esp, dword ptr [edi + {args_stack}] + mov [esp + 4], edi + mov [esp + 8], esp + + jmp {start_ap} + ", + + #[cfg(target_arch = "x86_64")] + " + // Note: The System V ABI requires the stack to be aligned to 16 bytes + // before the call instruction. As we jump rather than call it has to + // be offset by 8 bytes. Additionally reserve a bit more space at the + // end of the stack to ensure that the start function returns to + // address 0. + mov rax, qword ptr [rdi + {args_stack}] + lea rsp, [rax - 24] + + jmp {start_ap} + ", + + args_stack = const offset_of!(KernelArgsAp, stack_end), + start_ap = sym start_ap, + ); +} + +/// Entry to rust for an AP +unsafe extern "C" fn start_ap(args_ptr: *const KernelArgsAp) -> ! { + unsafe { + let cpu_id = { + let args = &*args_ptr; + + // Set up GDT + gdt::install_pcr(args.pcr_ptr); + + // Set up IDT + idt::install_idt(args.idt_ptr); + + // Initialize paging + paging::init(); + + crate::profiling::init(); + + #[cfg(target_arch = "x86_64")] + crate::arch::alternative::early_init(false); + + // Set up syscall instruction + interrupt::syscall::init(); + + // Initialize miscellaneous processor features + #[cfg(target_arch = "x86_64")] + crate::arch::misc::init(args.cpu_id); + + // Initialize devices (for AP) + device::init_ap(); + + args.cpu_id + }; + + crate::startup::kmain_ap(cpu_id); + } +} diff --git a/src/arch/x86_shared/stop.rs b/src/arch/x86_shared/stop.rs new file mode 100644 index 0000000000..498345bbc2 --- /dev/null +++ b/src/arch/x86_shared/stop.rs @@ -0,0 +1,122 @@ +use crate::{ + context, + scheme::acpi, + sync::CleanLockToken, + syscall::io::{Io, Pio}, + time, +}; + +pub unsafe fn kreset() -> ! 
{ + unsafe { + info!("kreset"); + + // 8042 reset + { + println!("Reset with 8042"); + let mut port = Pio::::new(0x64); + while port.readf(2) {} + port.write(0xFE); + } + + emergency_reset(); + } +} + +#[cfg(target_arch = "x86")] +pub unsafe fn emergency_reset() -> ! { + unsafe { + // Use triple fault to guarantee reset + core::arch::asm!( + " + cli + sidt [esp+16] + // set IDT limit to zero + mov word ptr [esp+16], 0 + lidt [esp+16] + int $3 + ", + options(noreturn) + ); + } +} + +#[cfg(target_arch = "x86_64")] +pub unsafe fn emergency_reset() -> ! { + unsafe { + // Use triple fault to guarantee reset + core::arch::asm!( + " + cli + sidt [rsp+16] + // set IDT limit to zero + mov word ptr [rsp+16], 0 + lidt [rsp+16] + int $3 + ", + options(noreturn) + ); + } +} + +fn userspace_acpi_shutdown(token: &mut CleanLockToken) { + if cfg!(not(feature = "acpi")) { + return; + } + + info!("Notifying any potential ACPI driver"); + // Tell whatever driver that handles ACPI, that it should enter the S5 state (i.e. + // shutdown). + if !acpi::register_kstop(token) { + // There was no context to switch to. + info!("No ACPI driver was alive to handle shutdown."); + return; + } + info!("Waiting one second for ACPI driver to run the shutdown sequence."); + let initial = time::monotonic(token); + + // Since this driver is a userspace process, and we do not use any magic like directly + // context switching, we have to wait for the userspace driver to complete, with a timeout. + // + // We switch context, and wait for one second. + loop { + // TODO: Switch directly to whichever process is handling the kstop pipe. We would add an + // event flag like EVENT_DIRECT, which has already been suggested for IRQs. + // TODO: Waitpid with timeout? Because, what if the ACPI driver would crash? 
+ let _ = context::switch(token); + + let current = time::monotonic(token); + if current - initial > time::NANOS_PER_SEC { + info!("Timeout reached, thus falling back to other shutdown methods."); + return; + } + } +} + +pub unsafe fn kstop(token: &mut CleanLockToken) -> ! { + unsafe { + info!("Running kstop()"); + + userspace_acpi_shutdown(token); + + // Magic shutdown code for bochs and qemu (older versions). + for c in "Shutdown".bytes() { + let port = 0x8900; + println!("Shutdown with outb(0x{:X}, '{}')", port, c as char); + Pio::::new(port).write(c); + } + + // Magic shutdown using qemu default ACPI method + { + let port = 0x604; + let data = 0x2000; + println!("Shutdown with outb(0x{:X}, 0x{:X})", port, data); + Pio::::new(port).write(data); + } + + // Magic code for VMWare. Also a hard lock. + println!("Shutdown with cli hlt"); + loop { + core::arch::asm!("cli; hlt"); + } + } +} diff --git a/src/arch/x86_shared/time.rs b/src/arch/x86_shared/time.rs new file mode 100644 index 0000000000..79c3f13c74 --- /dev/null +++ b/src/arch/x86_shared/time.rs @@ -0,0 +1,69 @@ +use super::device::{hpet, pit}; +use crate::sync::CleanLockToken; + +pub fn monotonic_absolute(token: &mut CleanLockToken) -> u128 { + // The paravirtualized TSC is already guaranteed to be monotonic, and thus doesn't need to be + // readjusted. + #[cfg(feature = "x86_kvm_pv")] + if let Some(ns) = super::device::tsc::monotonic_absolute() { + return ns; + } + + let offset = { *crate::time::OFFSET.read(token.token()) }; + offset + hpet_or_pit() +} +fn hpet_or_pit() -> u128 { + if cfg!(feature = "acpi") + && let Some(ref hpet) = *crate::acpi::ACPI_TABLE.hpet.read() + { + //TODO: handle rollover? 
+ //TODO: improve performance + + // Current count + let counter = unsafe { hpet.read_u64(hpet::MAIN_COUNTER_OFFSET) }; + // Comparator holds next interrupt count + let comparator = unsafe { hpet.read_u64(hpet::T0_COMPARATOR_OFFSET) }; + // Get period in femtoseconds + let capability = unsafe { hpet.read_u64(hpet::CAPABILITY_OFFSET) }; + + // There seems to be a bug in qemu on macos that causes the calculation to produce 0 for + // period_fs and hence a divide by zero calculating the divisor - workaround it while we + // try and get a fix from qemu: https://gitlab.com/qemu-project/qemu/-/issues/1570 + let mut period_fs = capability >> 32; + if period_fs == 0 { + period_fs = 10_000_000; + } + + // Calculate divisor + let divisor = (pit::RATE as u64 * 1_000_000) / period_fs; + // Calculate last interrupt + let last_interrupt = comparator.saturating_sub(divisor); + // Calculate ticks since last interrupt + let elapsed = counter.saturating_sub(last_interrupt); + // Calculate nanoseconds since last interrupt + return (elapsed as u128 * period_fs as u128) / 1_000_000; + } + // Read ticks since last interrupt + let elapsed = unsafe { pit::read() }; + // Calculate nanoseconds since last interrupt + (elapsed as u128 * pit::PERIOD_FS) / 1_000_000 +} + +pub fn monotonic_resolution() -> u128 { + #[cfg(feature = "x86_kvm_pv")] + if super::device::tsc::monotonic_absolute().is_some() { + return 1; + } + + if let Some(ref hpet) = *crate::acpi::ACPI_TABLE.hpet.read() { + let capability = unsafe { hpet.read_u64(hpet::CAPABILITY_OFFSET) }; + let mut period_fs = capability >> 32; + if period_fs == 0 { + period_fs = 10_000_000; + } + + return (period_fs as u128) / 1_000_000; + } + + pit::PERIOD_FS / 1_000_000 +} diff --git a/src/asm/x86/trampoline.asm b/src/asm/x86/trampoline.asm new file mode 100644 index 0000000000..8002009d2b --- /dev/null +++ b/src/asm/x86/trampoline.asm @@ -0,0 +1,160 @@ +; trampoline for bringing up APs +; compiled with nasm by build.rs, and included in 
src/acpi/madt.rs + +ORG 0x8000 +SECTION .text +USE16 + +trampoline: + jmp short startup_ap + times 8 - ($ - trampoline) nop + .ready: dq 0 + .args_ptr: dq 0 + .page_table: dq 0 + .code: dq 0 + +startup_ap: + cli + + xor ax, ax + mov ds, ax + mov es, ax + mov ss, ax + + ; initialize stack to invalid value + mov sp, 0 + + ; cr3 holds pointer to PML4 + mov edi, [trampoline.page_table] + mov cr3, edi + + ; enable FPU + mov eax, cr0 + and al, 11110011b ; Clear task switched (3) and emulation (2) + or al, 00100010b ; Set numeric error (5) monitor co-processor (1) + mov cr0, eax + + ; 9: FXSAVE/FXRSTOR + ; 7: Page Global + ; 4: Page Size Extension + mov eax, cr4 + or eax, 1 << 9 | 1 << 7 | 1 << 4 + mov cr4, eax + + ; initialize floating point registers + fninit + + ; load protected mode GDT + lgdt [gdtr] + + ;enabling paging and protection simultaneously + mov ebx, cr0 + ; 31: Paging + ; 16: write protect kernel + ; 0: Protected Mode + or ebx, 1 << 31 | 1 << 16 | 1 + mov cr0, ebx + + ; far jump to enable Protected Mode and load CS with 32 bit segment + jmp gdt.kernel_code:protected_mode_ap + +USE32 +protected_mode_ap: + mov eax, gdt.kernel_data + mov ds, eax + mov es, eax + mov fs, eax + mov gs, eax + mov ss, eax + + mov edi, [trampoline.args_ptr] + + mov eax, [trampoline.code] + mov dword [trampoline.ready], 1 + jmp eax + +struc GDTEntry + .limitl resw 1 + .basel resw 1 + .basem resb 1 + .attribute resb 1 + .flags__limith resb 1 + .baseh resb 1 +endstruc + +attrib: + .present equ 1 << 7 + .ring1 equ 1 << 5 + .ring2 equ 1 << 6 + .ring3 equ 1 << 5 | 1 << 6 + .user equ 1 << 4 +;user + .code equ 1 << 3 +; code + .conforming equ 1 << 2 + .readable equ 1 << 1 +; data + .expand_down equ 1 << 2 + .writable equ 1 << 1 + .accessed equ 1 << 0 +;system +; legacy + .tssAvailabe16 equ 0x1 + .ldt equ 0x2 + .tssBusy16 equ 0x3 + .call16 equ 0x4 + .task equ 0x5 + .interrupt16 equ 0x6 + .trap16 equ 0x7 + .tssAvailabe32 equ 0x9 + .tssBusy32 equ 0xB + .call32 equ 0xC + .interrupt32 equ 0xE + 
.trap32 equ 0xF +; long mode + .ldt32 equ 0x2 + .tssAvailabe64 equ 0x9 + .tssBusy64 equ 0xB + .call64 equ 0xC + .interrupt64 equ 0xE + .trap64 equ 0xF + +flags: + .granularity equ 1 << 7 + .available equ 1 << 4 +;user + .default_operand_size equ 1 << 6 +; code + .long_mode equ 1 << 5 +; data + .reserved equ 1 << 5 + +gdtr: + dw gdt.end + 1 ; size + dq gdt ; offset + +gdt: +.null equ $ - gdt + dq 0 + +.kernel_code equ $ - gdt +istruc GDTEntry + at GDTEntry.limitl, dw 0xFFFF + at GDTEntry.basel, dw 0 + at GDTEntry.basem, db 0 + at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code | attrib.readable + at GDTEntry.flags__limith, db 0xF | flags.granularity | flags.default_operand_size + at GDTEntry.baseh, db 0 +iend + +.kernel_data equ $ - gdt +istruc GDTEntry + at GDTEntry.limitl, dw 0xFFFF + at GDTEntry.basel, dw 0 + at GDTEntry.basem, db 0 + at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable + at GDTEntry.flags__limith, db 0xF | flags.granularity | flags.default_operand_size + at GDTEntry.baseh, db 0 +iend + +.end equ $ - gdt diff --git a/src/asm/x86_64/trampoline.asm b/src/asm/x86_64/trampoline.asm new file mode 100644 index 0000000000..bc762e2b84 --- /dev/null +++ b/src/asm/x86_64/trampoline.asm @@ -0,0 +1,168 @@ +; trampoline for bringing up APs +; compiled with nasm by build.rs, and included in src/acpi/madt.rs + +ORG 0x8000 +SECTION .text +USE16 + +trampoline: + jmp short startup_ap + times 8 - ($ - trampoline) nop + .ready: dq 0 + .args_ptr: dq 0 + .page_table: dq 0 + .code: dq 0 + +startup_ap: + cli + + xor ax, ax + mov ds, ax + mov es, ax + mov ss, ax + + ; initialize stack to invalid value + mov sp, 0 + + ; cr3 holds pointer to PML4 + mov edi, [trampoline.page_table] + mov cr3, edi + + ; enable FPU + mov eax, cr0 + and al, 11110011b ; Clear task switched (3) and emulation (2) + or al, 00100010b ; Set numeric error (5) monitor co-processor (1) + mov cr0, eax + + ; 9: FXSAVE/FXRSTOR + ; 7: Page Global + ; 5: Page Address 
Extension + ; 4: Page Size Extension + mov eax, cr4 + or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4 + mov cr4, eax + + ; initialize floating point registers + fninit + + ; load protected mode GDT + lgdt [gdtr] + + ; enable long mode + mov ecx, 0xC0000080 ; Read from the EFER MSR. + rdmsr + or eax, 1 << 11 | 1 << 8 ; Set the Long-Mode-Enable and NXE bit. + wrmsr + + ; enabling paging and protection simultaneously + mov ebx, cr0 + ; 31: Paging + ; 16: write protect kernel + ; 0: Protected Mode + or ebx, 1 << 31 | 1 << 16 | 1 + mov cr0, ebx + + ; far jump to enable Long Mode and load CS with 64 bit segment + jmp gdt.kernel_code:long_mode_ap + +USE64 +long_mode_ap: + mov rax, gdt.kernel_data + mov ds, rax + mov es, rax + mov fs, rax + mov gs, rax + mov ss, rax + + mov rdi, [trampoline.args_ptr] + + mov rax, [trampoline.code] + mov qword [trampoline.ready], 1 + jmp rax + +struc GDTEntry + .limitl resw 1 + .basel resw 1 + .basem resb 1 + .attribute resb 1 + .flags__limith resb 1 + .baseh resb 1 +endstruc + +attrib: + .present equ 1 << 7 + .ring1 equ 1 << 5 + .ring2 equ 1 << 6 + .ring3 equ 1 << 5 | 1 << 6 + .user equ 1 << 4 +;user + .code equ 1 << 3 +; code + .conforming equ 1 << 2 + .readable equ 1 << 1 +; data + .expand_down equ 1 << 2 + .writable equ 1 << 1 + .accessed equ 1 << 0 +;system +; legacy + .tssAvailabe16 equ 0x1 + .ldt equ 0x2 + .tssBusy16 equ 0x3 + .call16 equ 0x4 + .task equ 0x5 + .interrupt16 equ 0x6 + .trap16 equ 0x7 + .tssAvailabe32 equ 0x9 + .tssBusy32 equ 0xB + .call32 equ 0xC + .interrupt32 equ 0xE + .trap32 equ 0xF +; long mode + .ldt32 equ 0x2 + .tssAvailabe64 equ 0x9 + .tssBusy64 equ 0xB + .call64 equ 0xC + .interrupt64 equ 0xE + .trap64 equ 0xF + +flags: + .granularity equ 1 << 7 + .available equ 1 << 4 +;user + .default_operand_size equ 1 << 6 +; code + .long_mode equ 1 << 5 +; data + .reserved equ 1 << 5 + +gdtr: + dw gdt.end + 1 ; size + dq gdt ; offset + +gdt: +.null equ $ - gdt + dq 0 + +.kernel_code equ $ - gdt +istruc GDTEntry + at 
GDTEntry.limitl, dw 0 + at GDTEntry.basel, dw 0 + at GDTEntry.basem, db 0 + at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code + at GDTEntry.flags__limith, db flags.long_mode + at GDTEntry.baseh, db 0 +iend + +.kernel_data equ $ - gdt +istruc GDTEntry + at GDTEntry.limitl, dw 0 + at GDTEntry.basel, dw 0 + at GDTEntry.basem, db 0 +; AMD System Programming Manual states that the writeable bit is ignored in long mode, but ss can not be set to this descriptor without it + at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable + at GDTEntry.flags__limith, db 0 + at GDTEntry.baseh, db 0 +iend + +.end equ $ - gdt diff --git a/src/common/aligned_box.rs b/src/common/aligned_box.rs new file mode 100644 index 0000000000..578e0f5c96 --- /dev/null +++ b/src/common/aligned_box.rs @@ -0,0 +1,128 @@ +use core::alloc::{GlobalAlloc, Layout}; + +use crate::memory::Enomem; + +// Necessary because GlobalAlloc::dealloc requires the layout to be the same, and therefore Box +// cannot be used for increased alignment directly. 
+pub struct AlignedBox { + inner: *mut T, +} +unsafe impl Send for AlignedBox {} +unsafe impl Sync for AlignedBox {} + +/// # Safety +/// All types implementing this trait must be valid when zeroed +pub unsafe trait ValidForZero {} +unsafe impl ValidForZero for [u8; N] {} +unsafe impl ValidForZero for u8 {} + +impl AlignedBox { + fn layout(&self) -> Layout { + layout_upgrade_align(Layout::for_value::(self), ALIGN) + } +} +const fn layout_upgrade_align(layout: Layout, align: usize) -> Layout { + const fn max(a: usize, b: usize) -> usize { + if a > b { + a + } else { + b + } + } + let Ok(x) = Layout::from_size_align(layout.size(), max(align, layout.align())) else { + panic!("failed to calculate layout"); + }; + x +} + +impl AlignedBox { + #[inline(always)] + pub fn try_zeroed() -> Result + where + T: ValidForZero, + { + Ok(unsafe { + let ptr = + crate::ALLOCATOR.alloc_zeroed(layout_upgrade_align(Layout::new::(), ALIGN)); + if ptr.is_null() { + return Err(Enomem); + } + Self { inner: ptr.cast() } + }) + } +} +impl AlignedBox<[T], ALIGN> { + #[inline] + pub fn try_zeroed_slice(len: usize) -> Result + where + T: ValidForZero, + { + Ok(unsafe { + let ptr = crate::ALLOCATOR.alloc_zeroed(layout_upgrade_align( + Layout::array::(len).unwrap(), + ALIGN, + )); + if ptr.is_null() { + return Err(Enomem); + } + Self { + inner: core::ptr::slice_from_raw_parts_mut(ptr.cast(), len), + } + }) + } +} + +impl core::fmt::Debug for AlignedBox { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "[aligned box at {:p}, size {} alignment {}]", + self.inner, + self.layout().size(), + self.layout().align() + ) + } +} +impl Drop for AlignedBox { + fn drop(&mut self) { + unsafe { + let layout = self.layout(); + core::ptr::drop_in_place(self.inner); + crate::ALLOCATOR.dealloc(self.inner.cast(), layout); + } + } +} +impl core::ops::Deref for AlignedBox { + type Target = T; + + fn deref(&self) -> &Self::Target { + unsafe { &*self.inner } + } +} +impl 
core::ops::DerefMut for AlignedBox { + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { &mut *self.inner } + } +} +impl Clone for AlignedBox { + fn clone(&self) -> Self { + let mut new = + Self::try_zeroed().unwrap_or_else(|_| alloc::alloc::handle_alloc_error(self.layout())); + T::clone_from(&mut new, self); + new + } +} +impl Clone for AlignedBox<[T], ALIGN> { + fn clone(&self) -> Self { + let mut new = Self::try_zeroed_slice(self.len()) + .unwrap_or_else(|_| alloc::alloc::handle_alloc_error(self.layout())); + for (i, val) in self.iter().enumerate() { + if let Some(new_inner) = new.get_mut(i) { + new_inner.clone_from(val); + } else { + unreachable!(); + } + } + new + } +} diff --git a/src/common/int_like.rs b/src/common/int_like.rs new file mode 100644 index 0000000000..a427a8d6fc --- /dev/null +++ b/src/common/int_like.rs @@ -0,0 +1,161 @@ +//! Helpers used to define types that are backed by integers (typically `usize`), +//! without compromising safety. +//! +//! # Example +//! +//! ``` +//! /// Define an opaque type `Pid` backed by a `usize`. +//! int_like!(Pid, usize); +//! +//! const ZERO: Pid = Pid::from(0); +//! ``` +//! +//! # Example +//! +//! ``` +//! /// Define opaque types `Pid` and `AtomicPid`, backed respectively by a `usize` +//! /// and a `AtomicUsize`. +//! +//! int_like!(Pid, AtomicPid, usize, AtomicUsize); +//! +//! const ZERO: Pid = Pid::from(0); +//! let ATOMIC_PID: AtomicPid = AtomicPid::default(); +//! ``` + +#[macro_export] +macro_rules! 
int_like { + ($new_type_name:ident, $backing_type: ident) => { + #[derive(Default, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Clone, Copy)] + pub struct $new_type_name($backing_type); + + impl $new_type_name { + #[allow(dead_code)] + #[inline] + pub const fn get(self) -> $backing_type { + self.0 + } + #[allow(dead_code)] + #[inline] + pub const fn new(x: $backing_type) -> Self { + $new_type_name(x) + } + } + + impl ::core::convert::From<$backing_type> for $new_type_name { + #[inline] + fn from(inner: $backing_type) -> Self { + Self::new(inner) + } + } + impl ::core::convert::From<$new_type_name> for $backing_type { + #[inline] + fn from(wrapped: $new_type_name) -> Self { + wrapped.get() + } + } + }; + + ($new_type_name:ident, $new_atomic_type_name: ident, $backing_type:ident, $backing_atomic_type:ident) => { + int_like!($new_type_name, $backing_type); + + /// A mutable holder for T that can safely be shared among threads. + /// Runtime equivalent to using `AtomicUsize`, just type-safer. 
+ pub struct $new_atomic_type_name { + container: $backing_atomic_type, + } + + impl $new_atomic_type_name { + #[allow(dead_code)] + #[inline] + pub const fn new(x: $new_type_name) -> Self { + $new_atomic_type_name { + container: $backing_atomic_type::new(x.get()), + } + } + #[allow(dead_code)] + #[inline] + pub fn load(&self, order: ::core::sync::atomic::Ordering) -> $new_type_name { + $new_type_name::from(self.container.load(order)) + } + #[allow(dead_code)] + #[inline] + pub fn store(&self, val: $new_type_name, order: ::core::sync::atomic::Ordering) { + self.container.store(val.into(), order) + } + #[allow(dead_code)] + #[inline] + pub fn swap( + &self, + val: $new_type_name, + order: ::core::sync::atomic::Ordering, + ) -> $new_type_name { + $new_type_name::from(self.container.swap(val.into(), order)) + } + #[allow(dead_code)] + #[inline] + pub fn fetch_add( + &self, + with: $new_type_name, + order: ::core::sync::atomic::Ordering, + ) -> $new_type_name { + $new_type_name::from(self.container.fetch_add(with.into(), order)) + } + #[allow(dead_code)] + #[inline] + pub fn compare_exchange( + &self, + current: $new_type_name, + new: $new_type_name, + success: ::core::sync::atomic::Ordering, + failure: ::core::sync::atomic::Ordering, + ) -> ::core::result::Result<$new_type_name, $new_type_name> { + match self + .container + .compare_exchange(current.into(), new.into(), success, failure) + { + Ok(result) => Ok($new_type_name::from(result)), + Err(result) => Err($new_type_name::from(result)), + } + } + #[allow(dead_code)] + #[inline] + pub fn compare_exchange_weak( + &self, + current: $new_type_name, + new: $new_type_name, + success: ::core::sync::atomic::Ordering, + failure: ::core::sync::atomic::Ordering, + ) -> ::core::result::Result<$new_type_name, $new_type_name> { + match self.container.compare_exchange_weak( + current.into(), + new.into(), + success, + failure, + ) { + Ok(result) => Ok($new_type_name::from(result)), + Err(result) => 
Err($new_type_name::from(result)), + } + } + } + impl ::core::default::Default for $new_atomic_type_name { + #[inline] + fn default() -> Self { + Self::new($new_type_name::new(0)) + } + } + }; +} + +#[test] +fn test() { + use ::core::sync::atomic::AtomicUsize; + + // Generate type `usize_like`. + int_like!(UsizeLike, usize); + assert_eq!(size_of::(), size_of::()); + + // Generate types `usize_like` and `AtomicUsize`. + int_like!(UsizeLike2, AtomicUsizeLike, usize, AtomicUsize); + assert_eq!(size_of::(), size_of::()); + assert_eq!(size_of::(), size_of::()); +} diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 0000000000..64a5eebde6 --- /dev/null +++ b/src/common/mod.rs @@ -0,0 +1,27 @@ +pub mod aligned_box; +#[macro_use] +pub mod int_like; + +/// Debug macro, lifted from the std +#[macro_export] +macro_rules! dbg { + () => { + $crate::println!("[{}:{}]", file!(), line!()); + }; + ($val:expr_2021) => { + // Use of `match` here is intentional because it affects the lifetimes + // of temporaries - https://stackoverflow.com/a/48732525/1063961 + match $val { + tmp => { + $crate::println!("[{}:{}] {} = {:#?}", + file!(), line!(), stringify!($val), &tmp); + tmp + } + } + }; + // Trailing comma with single argument is ignored + ($val:expr_2021,) => { $crate::dbg!($val) }; + ($($val:expr_2021),+ $(,)?) 
=> { + ($($crate::dbg!($val)),+,) + }; +} diff --git a/src/context/arch/aarch64.rs b/src/context/arch/aarch64.rs new file mode 100644 index 0000000000..33dc83a987 --- /dev/null +++ b/src/context/arch/aarch64.rs @@ -0,0 +1,391 @@ +use crate::{ + arch::{device::cpu::registers::control_regs, interrupt::InterruptStack}, + context::context::Kstack, + percpu::PercpuBlock, + syscall::FloatRegisters, +}; +use core::{mem::offset_of, ptr, sync::atomic::AtomicBool}; +use spin::Once; +use syscall::{EnvRegisters, Result}; + +/// This must be used by the kernel to ensure that context switches are done atomically +/// Compare and exchange this to true when beginning a context switch on any CPU +/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch +/// This must be done, as no locks can be held on the stack during switch +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + +// 512 bytes for registers, extra bytes for fpcr and fpsr +pub const KFX_ALIGN: usize = 16; + +#[derive(Clone, Debug)] +pub struct Context { + elr_el1: usize, + sp_el0: usize, + pub(crate) tpidr_el0: usize, /* Pointer to TLS region for this Context */ + pub(crate) tpidrro_el0: usize, /* Pointer to TLS (read-only) region for this Context */ + spsr_el1: usize, + esr_el1: usize, + fx_loadable: bool, + sp: usize, /* Stack Pointer (x31) */ + lr: usize, /* Link Register (x30) */ + fp: usize, /* Frame pointer Register (x29) */ + x28: usize, /* Callee saved Register */ + x27: usize, /* Callee saved Register */ + x26: usize, /* Callee saved Register */ + x25: usize, /* Callee saved Register */ + x24: usize, /* Callee saved Register */ + x23: usize, /* Callee saved Register */ + x22: usize, /* Callee saved Register */ + x21: usize, /* Callee saved Register */ + x20: usize, /* Callee saved Register */ + x19: usize, /* Callee saved Register */ +} + +impl Context { + pub fn new() -> Context { + Context { + elr_el1: 0, + sp_el0: 0, + tpidr_el0: 0, + tpidrro_el0: 
0, + spsr_el1: 0, + esr_el1: 0, + fx_loadable: false, + sp: 0, + lr: 0, + fp: 0, + x28: 0, + x27: 0, + x26: 0, + x25: 0, + x24: 0, + x23: 0, + x22: 0, + x21: 0, + x20: 0, + x19: 0, + } + } + + fn set_stack(&mut self, address: usize) { + self.sp = address; + } + + fn set_x28(&mut self, x28: usize) { + self.x28 = x28; + } + + fn set_lr(&mut self, address: usize) { + self.lr = address; + } + + fn set_context_handle(&mut self) { + let address = self as *const _ as usize; + self.tpidrro_el0 = address; + } + + pub(crate) fn setup_initial_call( + &mut self, + stack: &Kstack, + func: extern "C" fn(), + userspace_allowed: bool, + ) { + let mut stack_top = stack.initial_top(); + + const INT_REGS_SIZE: usize = size_of::(); + + if userspace_allowed { + unsafe { + // Zero-initialize InterruptStack registers. + stack_top = stack_top.sub(INT_REGS_SIZE); + stack_top.write_bytes(0_u8, INT_REGS_SIZE); + (&mut *stack_top.cast::()).init(); + } + } + + self.set_lr(crate::arch::interrupt::syscall::enter_usermode as usize); + self.set_x28(func as usize); + self.set_context_handle(); + + self.set_stack(stack_top as usize); + } + + #[allow(unused)] + pub fn dump(&self) { + println!("elr_el1: 0x{:016x}", self.elr_el1); + println!("sp_el0: 0x{:016x}", self.sp_el0); + println!("tpidr_el0: 0x{:016x}", self.tpidr_el0); + println!("tpidrro_el0: 0x{:016x}", self.tpidrro_el0); + println!("spsr_el1: 0x{:016x}", self.spsr_el1); + println!("esr_el1: 0x{:016x}", self.esr_el1); + println!("sp: 0x{:016x}", self.sp); + println!("lr: 0x{:016x}", self.lr); + println!("fp: 0x{:016x}", self.fp); + println!("x28: 0x{:016x}", self.x28); + println!("x27: 0x{:016x}", self.x27); + println!("x26: 0x{:016x}", self.x26); + println!("x25: 0x{:016x}", self.x25); + println!("x24: 0x{:016x}", self.x24); + println!("x23: 0x{:016x}", self.x23); + println!("x22: 0x{:016x}", self.x22); + println!("x21: 0x{:016x}", self.x21); + println!("x20: 0x{:016x}", self.x20); + println!("x19: 0x{:016x}", self.x19); + } +} + +impl 
super::Context { + pub fn get_fx_regs(&self) -> FloatRegisters { + if !self.arch.fx_loadable { + panic!("TODO: make get_fx_regs always work"); + } + + unsafe { ptr::read(self.kfx.as_ptr() as *const FloatRegisters) } + } + + pub fn set_fx_regs(&mut self, new: FloatRegisters) { + if !self.arch.fx_loadable { + panic!("TODO: make set_fx_regs always work"); + } + + unsafe { + ptr::write(self.kfx.as_mut_ptr() as *mut FloatRegisters, new); + } + } + pub fn current_syscall(&self) -> Option<[usize; 7]> { + if !self.inside_syscall { + return None; + } + let regs = self.regs()?; + let scratch = ®s.scratch; + Some([ + scratch.x8, scratch.x0, scratch.x1, scratch.x2, scratch.x3, scratch.x4, scratch.x5, + ]) + } + + pub(crate) fn write_current_env_regs(&self, regs: EnvRegisters) -> Result<()> { + unsafe { + control_regs::tpidr_el0_write(regs.tpidr_el0 as u64); + control_regs::tpidrro_el0_write(regs.tpidrro_el0 as u64); + } + Ok(()) + } + + pub(crate) fn write_env_regs(&mut self, regs: EnvRegisters) -> Result<()> { + self.arch.tpidr_el0 = regs.tpidr_el0; + self.arch.tpidrro_el0 = regs.tpidrro_el0; + Ok(()) + } + + pub(crate) fn read_current_env_regs(&self) -> Result { + unsafe { + Ok(EnvRegisters { + tpidr_el0: control_regs::tpidr_el0() as usize, + tpidrro_el0: control_regs::tpidrro_el0() as usize, + }) + } + } + + pub(crate) fn read_env_regs(&self) -> Result { + Ok(EnvRegisters { + tpidr_el0: self.arch.tpidr_el0, + tpidrro_el0: self.arch.tpidrro_el0, + }) + } + pub fn set_userspace_io_allowed(&mut self, _allowed: bool) {} +} + +pub static EMPTY_CR3: Once = Once::new(); + +// SAFETY: EMPTY_CR3 must be initialized. 
+pub unsafe fn empty_cr3() -> rmm::PhysicalAddress { + unsafe { + debug_assert!(EMPTY_CR3.poll().is_some()); + *EMPTY_CR3.get_unchecked() + } +} + +#[target_feature(enable = "neon")] +unsafe extern "C" fn fp_save(float_regs: &mut FloatRegisters) { + unsafe { + core::arch::asm!( + "stp q0, q1, [{3}, {0} + 16 * 0]", + "stp q2, q3, [{3}, {0} + 16 * 2]", + "stp q4, q5, [{3}, {0} + 16 * 4]", + "stp q6, q7, [{3}, {0} + 16 * 6]", + "stp q8, q9, [{3}, {0} + 16 * 8]", + "stp q10, q11, [{3}, {0} + 16 * 10]", + "stp q12, q13, [{3}, {0} + 16 * 12]", + "stp q14, q15, [{3}, {0} + 16 * 14]", + "stp q16, q17, [{3}, {0} + 16 * 16]", + "stp q18, q19, [{3}, {0} + 16 * 18]", + "stp q20, q21, [{3}, {0} + 16 * 20]", + "stp q22, q23, [{3}, {0} + 16 * 22]", + "stp q24, q25, [{3}, {0} + 16 * 24]", + "stp q26, q27, [{3}, {0} + 16 * 26]", + "stp q28, q29, [{3}, {0} + 16 * 28]", + "stp q30, q31, [{3}, {0} + 16 * 30]", + "mrs x9, fpcr", + "add {3}, {3}, {1}", + "str x9, [{3}]", + "mrs x9, fpsr", + "str x9, [{3}, {2} - {1}]", + const offset_of!(FloatRegisters, fp_simd_regs), + const offset_of!(FloatRegisters, fpcr), + const offset_of!(FloatRegisters, fpsr), + inout(reg) float_regs => _, + ); + } +} + +#[target_feature(enable = "neon")] +unsafe extern "C" fn fp_load(float_regs: &mut FloatRegisters) { + unsafe { + core::arch::asm!( + "ldp q0, q1, [{3}, {0} + 16 * 0]", + "ldp q2, q3, [{3}, {0} + 16 * 2]", + "ldp q4, q5, [{3}, {0} + 16 * 4]", + "ldp q6, q7, [{3}, {0} + 16 * 6]", + "ldp q8, q9, [{3}, {0} + 16 * 8]", + "ldp q10, q11, [{3}, {0} + 16 * 10]", + "ldp q12, q13, [{3}, {0} + 16 * 12]", + "ldp q14, q15, [{3}, {0} + 16 * 14]", + "ldp q16, q17, [{3}, {0} + 16 * 16]", + "ldp q18, q19, [{3}, {0} + 16 * 18]", + "ldp q20, q21, [{3}, {0} + 16 * 20]", + "ldp q22, q23, [{3}, {0} + 16 * 22]", + "ldp q24, q25, [{3}, {0} + 16 * 24]", + "ldp q26, q27, [{3}, {0} + 16 * 26]", + "ldp q28, q29, [{3}, {0} + 16 * 28]", + "ldp q30, q31, [{3}, {0} + 16 * 30]", + "add {3}, {3}, {1}", + "ldr x9, [{3}]", + "msr 
fpcr, x9", + "ldr x9, [{3}, {2} - {1}]", + "msr fpsr, x9", + const offset_of!(FloatRegisters, fp_simd_regs), + const offset_of!(FloatRegisters, fpcr), + const offset_of!(FloatRegisters, fpsr), + inout(reg) float_regs => _, + // x9 carries fpcr/fpsr from memory, so it must be declared as clobbered; this also + // keeps the allocator from assigning {3} to x9. + out("x9") _, + ); + } +} + +pub unsafe fn switch_to(prev: &mut super::Context, next: &mut super::Context) { + unsafe { + fp_save(&mut *(prev.kfx.as_mut_ptr() as *mut FloatRegisters)); + + prev.arch.fx_loadable = true; + + if next.arch.fx_loadable { + fp_load(&mut *(next.kfx.as_mut_ptr() as *mut FloatRegisters)); + } + + PercpuBlock::current() + .new_addrsp_tmp + .set(next.addr_space.clone()); + + switch_to_inner(&mut prev.arch, &mut next.arch) + } +} + +#[unsafe(naked)] +unsafe extern "C" fn switch_to_inner(_prev: &mut Context, _next: &mut Context) { + core::arch::naked_asm!( + " + str x19, [x0, #{off_x19}] + ldr x19, [x1, #{off_x19}] + + str x20, [x0, #{off_x20}] + ldr x20, [x1, #{off_x20}] + + str x21, [x0, #{off_x21}] + ldr x21, [x1, #{off_x21}] + + str x22, [x0, #{off_x22}] + ldr x22, [x1, #{off_x22}] + + str x23, [x0, #{off_x23}] + ldr x23, [x1, #{off_x23}] + + str x24, [x0, #{off_x24}] + ldr x24, [x1, #{off_x24}] + + str x25, [x0, #{off_x25}] + ldr x25, [x1, #{off_x25}] + + str x26, [x0, #{off_x26}] + ldr x26, [x1, #{off_x26}] + + str x27, [x0, #{off_x27}] + ldr x27, [x1, #{off_x27}] + + str x28, [x0, #{off_x28}] + ldr x28, [x1, #{off_x28}] + + str x29, [x0, #{off_x29}] + ldr x29, [x1, #{off_x29}] + + str x30, [x0, #{off_x30}] + ldr x30, [x1, #{off_x30}] + + mrs x2, elr_el1 + str x2, [x0, #{off_elr_el1}] + ldr x2, [x1, #{off_elr_el1}] + msr elr_el1, x2 + + mrs x2, sp_el0 + str x2, [x0, #{off_sp_el0}] + ldr x2, [x1, #{off_sp_el0}] + msr sp_el0, x2 + + mrs x2, tpidr_el0 + str x2, [x0, #{off_tpidr_el0}] + ldr x2, [x1, #{off_tpidr_el0}] + msr tpidr_el0, x2 + + mrs x2, tpidrro_el0 + str x2, [x0, #{off_tpidrro_el0}] + ldr x2, [x1, #{off_tpidrro_el0}] + msr tpidrro_el0, x2 + + mrs x2, spsr_el1 + str x2, [x0, #{off_spsr_el1}] + ldr x2, [x1, #{off_spsr_el1}] + 
msr spsr_el1, x2 + + mrs x2, esr_el1 + str x2, [x0, #{off_esr_el1}] + ldr x2, [x1, #{off_esr_el1}] + msr esr_el1, x2 + + mov x2, sp + str x2, [x0, #{off_sp}] + ldr x2, [x1, #{off_sp}] + mov sp, x2 + + b {switch_hook} + ", + off_x19 = const(offset_of!(Context, x19)), + off_x20 = const(offset_of!(Context, x20)), + off_x21 = const(offset_of!(Context, x21)), + off_x22 = const(offset_of!(Context, x22)), + off_x23 = const(offset_of!(Context, x23)), + off_x24 = const(offset_of!(Context, x24)), + off_x25 = const(offset_of!(Context, x25)), + off_x26 = const(offset_of!(Context, x26)), + off_x27 = const(offset_of!(Context, x27)), + off_x28 = const(offset_of!(Context, x28)), + off_x29 = const(offset_of!(Context, fp)), + off_x30 = const(offset_of!(Context, lr)), + off_elr_el1 = const(offset_of!(Context, elr_el1)), + off_sp_el0 = const(offset_of!(Context, sp_el0)), + off_tpidr_el0 = const(offset_of!(Context, tpidr_el0)), + off_tpidrro_el0 = const(offset_of!(Context, tpidrro_el0)), + off_spsr_el1 = const(offset_of!(Context, spsr_el1)), + off_esr_el1 = const(offset_of!(Context, esr_el1)), + off_sp = const(offset_of!(Context, sp)), + + switch_hook = sym crate::context::switch_finish_hook, + ); +} diff --git a/src/context/arch/riscv64.rs b/src/context/arch/riscv64.rs new file mode 100644 index 0000000000..4bd843e620 --- /dev/null +++ b/src/context/arch/riscv64.rs @@ -0,0 +1,224 @@ +use crate::{ + arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA, percpu::PercpuBlock, + syscall::FloatRegisters, +}; +use core::{mem::offset_of, sync::atomic::AtomicBool}; +use rmm::{Arch, VirtualAddress}; +use spin::Once; +use syscall::{error::*, EnvRegisters}; + +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + +pub const KFX_ALIGN: usize = 16; + +#[derive(Clone, Debug, Default)] +pub struct Context { + sp: usize, + ra: usize, + fp: usize, + s1: usize, + s2: usize, + s3: usize, + s4: usize, + s5: usize, + s6: usize, + s7: usize, + s8: usize, + s9: usize, + 
s10: usize, + s11: usize, + sstatus: usize, +} + +impl Context { + pub fn new() -> Self { + Self::default() + } + + fn set_stack(&mut self, address: usize) { + self.sp = address; + } + + fn set_ra(&mut self, address: usize) { + self.ra = address; + } + + fn set_s11(&mut self, address: usize) { + self.s11 = address; + } + + pub(crate) fn setup_initial_call( + &mut self, + stack: &Kstack, + func: extern "C" fn(), + userspace_allowed: bool, + ) { + let mut stack_top = stack.initial_top(); + + const INT_REGS_SIZE: usize = size_of::(); + + if userspace_allowed { + unsafe { + // Zero-initialize InterruptStack registers. + stack_top = stack_top.sub(INT_REGS_SIZE); + stack_top.write_bytes(0_u8, INT_REGS_SIZE); + (&mut *stack_top.cast::()).init(); + } + } + + self.set_ra(crate::arch::interrupt::syscall::enter_usermode as usize); + self.set_s11(func as usize); + + self.set_stack(stack_top as usize); + } +} + +impl super::Context { + pub fn get_fx_regs(&self) -> FloatRegisters { + unimplemented!() + } + + pub fn set_fx_regs(&mut self, mut _new: FloatRegisters) { + unimplemented!() + } + + /// Returns the saved `x17` and `x10`-`x15` registers of the in-progress syscall, or `None` + /// if the context is not inside a syscall or has no saved register frame. + pub fn current_syscall(&self) -> Option<[usize; 7]> { + if !self.inside_syscall { + return None; + } + let regs = self.regs()?; + let regs = &regs.registers; + Some([ + regs.x17, regs.x10, regs.x11, regs.x12, regs.x13, regs.x14, regs.x15, + ]) + } + + pub(crate) fn write_current_env_regs(&mut self, regs: EnvRegisters) -> Result<()> { + self.write_env_regs(regs) + } + + pub(crate) fn write_env_regs(&mut self, regs: EnvRegisters) -> Result<()> { + if RmmA::virt_is_valid(VirtualAddress::new(regs.tp)) { + match self.regs_mut() { + Some(stack) => { + stack.registers.x4 = regs.tp; + Ok(()) + } + None => Err(Error::new(ESRCH)), + } + } else { + Err(Error::new(EINVAL)) + } + } + + pub(crate) fn read_current_env_regs(&self) -> Result { + self.read_env_regs() + } + + pub(crate) fn read_env_regs(&self) -> Result { + match self.regs() { + Some(stack) => Ok(EnvRegisters { + tp: stack.registers.x4, + }), + 
None => Err(Error::new(ESRCH)), + } + } + pub fn set_userspace_io_allowed(&mut self, _allowed: bool) {} +} + +pub static EMPTY_CR3: Once = Once::new(); + +// SAFETY: EMPTY_CR3 must be initialized. +pub unsafe fn empty_cr3() -> rmm::PhysicalAddress { + unsafe { + debug_assert!(EMPTY_CR3.poll().is_some()); + *EMPTY_CR3.get_unchecked() + } +} + +/// Switch to the next context by restoring its stack and registers +pub unsafe fn switch_to(prev: &mut super::Context, next: &mut super::Context) { + unsafe { + // FIXME floating point + PercpuBlock::current() + .new_addrsp_tmp + .set(next.addr_space.clone()); + + switch_to_inner(&mut prev.arch, &mut next.arch); + } +} + +/// Saves `prev`'s s1-s11, sp, ra, fp and sstatus, loads `next`'s, then jumps to +/// `switch_finish_hook`. +/// +/// Each register is stored exactly once, before it is reloaded: storing it again after the +/// reload would write `next`'s value into `prev`'s save slot. +#[unsafe(naked)] +unsafe extern "C" fn switch_to_inner(prev: &mut Context, next: &mut Context) { + core::arch::naked_asm!(r#" + sd s1, {off_s1}(a0) + ld s1, {off_s1}(a1) + + sd s2, {off_s2}(a0) + ld s2, {off_s2}(a1) + + sd s3, {off_s3}(a0) + ld s3, {off_s3}(a1) + + sd s4, {off_s4}(a0) + ld s4, {off_s4}(a1) + + sd s5, {off_s5}(a0) + ld s5, {off_s5}(a1) + + sd s6, {off_s6}(a0) + ld s6, {off_s6}(a1) + + sd s7, {off_s7}(a0) + ld s7, {off_s7}(a1) + + sd s8, {off_s8}(a0) + ld s8, {off_s8}(a1) + + sd s9, {off_s9}(a0) + ld s9, {off_s9}(a1) + + sd s10, {off_s10}(a0) + ld s10, {off_s10}(a1) + + sd s11, {off_s11}(a0) + ld s11, {off_s11}(a1) + + sd sp, {off_sp}(a0) + ld sp, {off_sp}(a1) + + sd ra, {off_ra}(a0) + ld ra, {off_ra}(a1) + + sd fp, {off_fp}(a0) + ld fp, {off_fp}(a1) + + csrr t0, sstatus + sd t0, {off_sstatus}(a0) + ld t0, {off_sstatus}(a1) + csrw sstatus, t0 + + j {switch_hook} + "#, + off_s1 = const(offset_of!(Context, s1)), + off_s2 = const(offset_of!(Context, s2)), + off_s3 = const(offset_of!(Context, s3)), + off_s4 = const(offset_of!(Context, s4)), + off_s5 = const(offset_of!(Context, s5)), + off_s6 = const(offset_of!(Context, s6)), + off_s7 = const(offset_of!(Context, s7)), + off_s8 = const(offset_of!(Context, s8)), + off_s9 = const(offset_of!(Context, 
s9)), + off_s10 = const(offset_of!(Context, s10)), + off_s11 = const(offset_of!(Context, s11)), + off_sp = const(offset_of!(Context, sp)), + off_ra = const(offset_of!(Context, ra)), + off_fp = const(offset_of!(Context, fp)), + off_sstatus = const(offset_of!(Context, sstatus)), + + switch_hook = sym crate::context::switch_finish_hook, + ); +} diff --git a/src/context/arch/x86.rs b/src/context/arch/x86.rs new file mode 100644 index 0000000000..2862d35f20 --- /dev/null +++ b/src/context/arch/x86.rs @@ -0,0 +1,315 @@ +use core::{mem::offset_of, sync::atomic::AtomicBool}; +use rmm::{Arch, VirtualAddress}; +use spin::Once; +use syscall::{error::*, EnvRegisters}; + +use crate::{ + arch::{ + gdt::{pcr, GDT_USER_FS, GDT_USER_GS}, + interrupt::{self, InterruptStack}, + }, + context::context::Kstack, + memory::RmmA, + percpu::PercpuBlock, + syscall::FloatRegisters, +}; + +/// This must be used by the kernel to ensure that context switches are done atomically +/// Compare and exchange this to true when beginning a context switch on any CPU +/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch +/// This must be done, as no locks can be held on the stack during switch +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + +const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000; + +pub const KFX_ALIGN: usize = 16; + +#[derive(Clone, Debug)] +#[repr(C)] +pub struct Context { + /// EFLAGS register + eflags: usize, + /// EBX register + ebx: usize, + /// EDI register + edi: usize, + /// ESI register + esi: usize, + /// Base pointer + ebp: usize, + /// Stack pointer + pub(crate) esp: usize, + /// FSBASE. + /// + /// NOTE: Same fsgsbase behavior as with gsbase. + pub(crate) fsbase: usize, + /// GSBASE. + /// + /// NOTE: Without fsgsbase, this register will strictly be equal to the register value when + /// running. 
With fsgsbase, this is neither saved nor restored upon every syscall (there is no + /// need to!), and thus it must be re-read from the register before copying this struct. + pub(crate) gsbase: usize, + userspace_io_allowed: bool, +} + +impl Context { + pub fn new() -> Context { + Context { + eflags: 0, + ebx: 0, + edi: 0, + esi: 0, + ebp: 0, + esp: 0, + fsbase: 0, + gsbase: 0, + userspace_io_allowed: false, + } + } + + fn set_stack(&mut self, address: usize) { + self.esp = address; + } + + pub(crate) fn setup_initial_call( + &mut self, + stack: &Kstack, + func: extern "C" fn(), + userspace_allowed: bool, + ) { + let mut stack_top = stack.initial_top(); + + const INT_REGS_SIZE: usize = size_of::(); + + unsafe { + if userspace_allowed { + // Zero-initialize InterruptStack registers. + stack_top = stack_top.sub(INT_REGS_SIZE); + stack_top.write_bytes(0_u8, INT_REGS_SIZE); + (&mut *stack_top.cast::()).init(); + + stack_top = stack_top.sub(size_of::()); + stack_top + .cast::() + .write(interrupt::syscall::enter_usermode as usize); + } + + stack_top = stack_top.sub(size_of::()); + stack_top.cast::().write(func as usize); + } + + self.set_stack(stack_top as usize); + } +} + +impl super::Context { + pub fn get_fx_regs(&self) -> FloatRegisters { + let mut regs = unsafe { self.kfx.as_ptr().cast::().read() }; + regs._reserved = 0; + let mut new_st = regs.st_space; + for st in &mut new_st { + // Only allow access to the 80 lowest bits + *st &= !ST_RESERVED; + } + regs.st_space = new_st; + regs + } + + /// Overwrites the saved float state with `new`, keeping `_reserved` and the reserved bits + /// of every `st_space` entry from the state already stored in `kfx`. + pub fn set_fx_regs(&mut self, mut new: FloatRegisters) { + { + let old = unsafe { &*(self.kfx.as_ptr().cast::()) }; + new._reserved = old._reserved; + // The reserved bits must come from the currently saved state, not from the caller. + let old_st = old.st_space; + let mut new_st = new.st_space; + for (new_st, old_st) in new_st.iter_mut().zip(&old_st) { + *new_st &= !ST_RESERVED; + *new_st |= old_st & ST_RESERVED; + } + new.st_space = new_st; + + // Make sure we don't use `old` from now on + } + + unsafe { + self.kfx.as_mut_ptr().cast::().write(new); + } + } + 
pub fn set_userspace_io_allowed(&mut self, allowed: bool) { + self.arch.userspace_io_allowed = allowed; + + if self.is_current_context() { + unsafe { + crate::arch::gdt::set_userspace_io_allowed(crate::arch::gdt::pcr(), allowed); + } + } + } + pub fn current_syscall(&self) -> Option<[usize; 7]> { + if !self.inside_syscall { + return None; + } + let regs = self.regs()?; + Some([ + regs.scratch.eax, + regs.preserved.ebx, + regs.scratch.ecx, + regs.scratch.edx, + regs.preserved.esi, + regs.preserved.edi, + regs.preserved.ebp, + ]) + } + + pub(crate) fn write_current_env_regs(&mut self, regs: EnvRegisters) -> Result<()> { + if RmmA::virt_is_valid(VirtualAddress::new(regs.fsbase as usize)) + && RmmA::virt_is_valid(VirtualAddress::new(regs.gsbase as usize)) + { + unsafe { + (&mut *pcr()).gdt[GDT_USER_FS].set_offset(regs.fsbase); + (&mut *pcr()).gdt[GDT_USER_GS].set_offset(regs.gsbase); + } + self.arch.fsbase = regs.fsbase as usize; + self.arch.gsbase = regs.gsbase as usize; + Ok(()) + } else { + Err(Error::new(EINVAL)) + } + } + + pub(crate) fn write_env_regs(&mut self, regs: EnvRegisters) -> Result<()> { + if RmmA::virt_is_valid(VirtualAddress::new(regs.fsbase as usize)) + && RmmA::virt_is_valid(VirtualAddress::new(regs.gsbase as usize)) + { + self.arch.fsbase = regs.fsbase as usize; + self.arch.gsbase = regs.gsbase as usize; + Ok(()) + } else { + Err(Error::new(EINVAL)) + } + } + + pub(crate) fn read_current_env_regs(&self) -> Result { + unsafe { + Ok(EnvRegisters { + fsbase: (&*pcr()).gdt[GDT_USER_FS].offset(), + gsbase: (&*pcr()).gdt[GDT_USER_GS].offset(), + }) + } + } + + pub(crate) fn read_env_regs(&self) -> Result { + Ok(EnvRegisters { + fsbase: self.arch.fsbase as u32, + gsbase: self.arch.gsbase as u32, + }) + } +} + +pub static EMPTY_CR3: Once = Once::new(); + +// SAFETY: EMPTY_CR3 must be initialized. 
+pub unsafe fn empty_cr3() -> rmm::PhysicalAddress { + unsafe { + debug_assert!(EMPTY_CR3.poll().is_some()); + *EMPTY_CR3.get_unchecked() + } +} + +/// Switch to the next context by restoring its stack and registers +pub unsafe fn switch_to(prev: &mut super::Context, next: &mut super::Context) { + unsafe { + let pcr = crate::arch::gdt::pcr(); + + if let Some(ref stack) = next.kstack { + crate::arch::gdt::set_tss_stack(pcr, stack.initial_top() as usize); + } + crate::arch::gdt::set_userspace_io_allowed(pcr, next.arch.userspace_io_allowed); + + core::arch::asm!(" + fxsave [{prev_fx}] + fxrstor [{next_fx}] + ", prev_fx = in(reg) prev.kfx.as_mut_ptr(), + next_fx = in(reg) next.kfx.as_ptr(), + ); + + { + let gdt = &mut (*pcr).gdt; + + prev.arch.fsbase = gdt[GDT_USER_FS].offset() as usize; + gdt[GDT_USER_FS].set_offset(next.arch.fsbase as u32); + prev.arch.gsbase = gdt[GDT_USER_GS].offset() as usize; + gdt[GDT_USER_GS].set_offset(next.arch.gsbase as u32); + } + PercpuBlock::current() + .new_addrsp_tmp + .set(next.addr_space.clone()); + + core::arch::asm!( + "call {inner}", + inner = sym switch_to_inner, + in("ecx") &mut prev.arch, + in("edx") &mut next.arch, + ); + } +} + +// Check disassembly! +#[unsafe(naked)] +unsafe extern "cdecl" fn switch_to_inner() { + use Context as Cx; + + core::arch::naked_asm!( + // As a quick reminder of the register conventions this routine relies on: + // + // - the parameters are not passed the regular cdecl way: `switch_to` places `prev` in + // `ecx` and `next` in `edx` before its inline `call`, + // - we can modify scratch registers, e.g. eax + // - we cannot change callee-preserved registers arbitrarily, e.g. ebx, which is why we + // store them here in the first place. 
+ " + // ecx is prev, edx is next + + // Save old registers, and load new ones + mov [ecx + {off_ebx}], ebx + mov ebx, [edx + {off_ebx}] + + mov [ecx + {off_edi}], edi + mov edi, [edx + {off_edi}] + + mov [ecx + {off_esi}], esi + mov esi, [edx + {off_esi}] + + mov [ecx + {off_ebp}], ebp + mov ebp, [edx + {off_ebp}] + + mov [ecx + {off_esp}], esp + mov esp, [edx + {off_esp}] + + // push EFLAGS (can only be modified via stack) + pushfd + // pop EFLAGS into `self.eflags` + pop DWORD PTR [ecx + {off_eflags}] + + // push `next.eflags` + push DWORD PTR [edx + {off_eflags}] + // pop into EFLAGS + popfd + + // When we return, we cannot even guarantee that the return address on the stack, points to + // the calling function, `context::switch`. Thus, we have to execute this Rust hook by + // ourselves, which will unlock the contexts before the later switch. + + // Note that switch_finish_hook will be responsible for executing `ret`. + jmp {switch_hook} + + ", + + off_eflags = const(offset_of!(Cx, eflags)), + + off_ebx = const(offset_of!(Cx, ebx)), + off_edi = const(offset_of!(Cx, edi)), + off_esi = const(offset_of!(Cx, esi)), + off_ebp = const(offset_of!(Cx, ebp)), + off_esp = const(offset_of!(Cx, esp)), + + switch_hook = sym crate::context::switch_finish_hook, + ); +} diff --git a/src/context/arch/x86_64.rs b/src/context/arch/x86_64.rs new file mode 100644 index 0000000000..6758c9fca5 --- /dev/null +++ b/src/context/arch/x86_64.rs @@ -0,0 +1,395 @@ +use core::{ + ptr::{addr_of, addr_of_mut}, + sync::atomic::AtomicBool, +}; + +use crate::syscall::FloatRegisters; + +use crate::{arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA}; +use core::mem::offset_of; +use rmm::{Arch, VirtualAddress}; +use spin::Once; +use syscall::{error::*, EnvRegisters}; +use x86::msr; + +/// This must be used by the kernel to ensure that context switches are done atomically +/// Compare and exchange this to true when beginning a context switch on any CPU +/// The 
`Context::switch_to` function will set it back to false, allowing other CPU's to switch +/// This must be done, as no locks can be held on the stack during switch +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + +const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000; + +#[cfg(cpu_feature_never = "xsave")] +pub const KFX_ALIGN: usize = 16; + +#[cfg(not(cpu_feature_never = "xsave"))] +pub const KFX_ALIGN: usize = 64; + +// TODO: stack guarding? + +#[derive(Clone, Debug)] +#[repr(C)] +pub struct Context { + /// RFLAGS register + rflags: usize, + /// RBX register + rbx: usize, + /// R12 register + r12: usize, + /// R13 register + r13: usize, + /// R14 register + r14: usize, + /// R15 register + r15: usize, + /// Base pointer + rbp: usize, + /// Stack pointer + pub(crate) rsp: usize, + /// FSBASE. + /// + /// NOTE: Same fsgsbase behavior as with gsbase. + pub(crate) fsbase: usize, + /// GSBASE. + /// + /// NOTE: Without fsgsbase, this register will strictly be equal to the register value when + /// running. With fsgsbase, this is neither saved nor restored upon every syscall (there is no + /// need to!), and thus it must be re-read from the register before copying this struct. + pub(crate) gsbase: usize, + userspace_io_allowed: bool, +} + +impl Context { + pub fn new() -> Context { + Context { + rflags: 0, + rbx: 0, + r12: 0, + r13: 0, + r14: 0, + r15: 0, + rbp: 0, + rsp: 0, + fsbase: 0, + gsbase: 0, + userspace_io_allowed: false, + } + } + + fn set_stack(&mut self, address: usize) { + self.rsp = address; + } + + pub(crate) fn setup_initial_call( + &mut self, + stack: &Kstack, + func: extern "C" fn(), + userspace_allowed: bool, + ) { + let mut stack_top = stack.initial_top(); + + const INT_REGS_SIZE: usize = size_of::(); + + // Kstack::initial_top() is always at least 8 byte aligned. 
assertion to be safe + debug_assert!( + (stack_top as usize).is_multiple_of(8), + "Kstack not 8 byte aligned" + ); + #[expect(clippy::cast_ptr_alignment)] + unsafe { + if userspace_allowed { + // Zero-initialize InterruptStack registers. + stack_top = stack_top.sub(INT_REGS_SIZE); + stack_top.write_bytes(0_u8, INT_REGS_SIZE); + (*stack_top.cast::()).init(); + + stack_top = stack_top.sub(size_of::()); + stack_top + .cast::() + .write(crate::arch::interrupt::syscall::enter_usermode as usize); + } + + stack_top = stack_top.sub(size_of::()); + stack_top.cast::().write(func as usize); + } + + self.set_stack(stack_top as usize); + } +} +impl super::Context { + pub fn get_fx_regs(&self) -> FloatRegisters { + let mut regs = unsafe { self.kfx.as_ptr().cast::().read() }; + regs._reserved = 0; + let mut new_st = regs.st_space; + for st in &mut new_st { + // Only allow access to the 80 lowest bits + *st &= !ST_RESERVED; + } + regs.st_space = new_st; + regs + } + + /// Overwrites the saved float state with `new`, keeping `_reserved` and the reserved bits + /// of every `st_space` entry from the state already stored in `kfx`. + pub fn set_fx_regs(&mut self, mut new: FloatRegisters) { + { + let old = unsafe { &*(self.kfx.as_ptr().cast::()) }; + new._reserved = old._reserved; + // The reserved bits must come from the currently saved state, not from the caller. + let old_st = old.st_space; + let mut new_st = new.st_space; + for (new_st, old_st) in new_st.iter_mut().zip(&old_st) { + *new_st &= !ST_RESERVED; + *new_st |= old_st & ST_RESERVED; + } + new.st_space = new_st; + + // Make sure we don't use `old` from now on + } + + unsafe { + self.kfx.as_mut_ptr().cast::().write(new); + } + } + + pub fn set_userspace_io_allowed(&mut self, allowed: bool) { + self.arch.userspace_io_allowed = allowed; + + if self.is_current_context() { + unsafe { + crate::arch::gdt::set_userspace_io_allowed(crate::arch::gdt::pcr(), allowed); + } + } + } + + pub(crate) fn current_syscall(&self) -> Option<[usize; 7]> { + if !self.inside_syscall { + return None; + } + let regs = self.regs()?; + let scratch = &regs.scratch; + Some([ + scratch.rax, + scratch.rdi, + scratch.rsi, + scratch.rdx, + scratch.r10, + scratch.r8, + scratch.r9, + ]) + } + + pub(crate) fn 
read_current_env_regs(&self) -> Result { + // TODO: Avoid rdmsr if fsgsbase is not enabled, if this is worth optimizing for. + unsafe { + Ok(EnvRegisters { + fsbase: msr::rdmsr(msr::IA32_FS_BASE), + gsbase: msr::rdmsr(msr::IA32_KERNEL_GSBASE), + }) + } + } + + pub(crate) fn read_env_regs(&self) -> Result { + Ok(EnvRegisters { + fsbase: self.arch.fsbase as u64, + gsbase: self.arch.gsbase as u64, + }) + } + + pub(crate) fn write_current_env_regs(&mut self, regs: EnvRegisters) -> Result<()> { + if RmmA::virt_is_valid(VirtualAddress::new(regs.fsbase as usize)) + && RmmA::virt_is_valid(VirtualAddress::new(regs.gsbase as usize)) + { + unsafe { + x86::msr::wrmsr(x86::msr::IA32_FS_BASE, regs.fsbase); + // We have to write to KERNEL_GSBASE, because when the kernel returns to + // userspace, it will have executed SWAPGS first. + x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, regs.gsbase); + } + self.arch.fsbase = regs.fsbase as usize; + self.arch.gsbase = regs.gsbase as usize; + + Ok(()) + } else { + Err(Error::new(EINVAL)) + } + } + + pub(crate) fn write_env_regs(&mut self, regs: EnvRegisters) -> Result<()> { + if RmmA::virt_is_valid(VirtualAddress::new(regs.fsbase as usize)) + && RmmA::virt_is_valid(VirtualAddress::new(regs.gsbase as usize)) + { + self.arch.fsbase = regs.fsbase as usize; + self.arch.gsbase = regs.gsbase as usize; + Ok(()) + } else { + Err(Error::new(EINVAL)) + } + } +} + +pub static EMPTY_CR3: Once = Once::new(); + +// SAFETY: EMPTY_CR3 must be initialized. 
+pub unsafe fn empty_cr3() -> rmm::PhysicalAddress { + unsafe { + debug_assert!(EMPTY_CR3.poll().is_some()); + *EMPTY_CR3.get_unchecked() + } +} + +/// Switch to the next context by restoring its stack and registers +pub unsafe fn switch_to(prev: &mut super::Context, next: &mut super::Context) { + unsafe { + let pcr = crate::arch::gdt::pcr(); + + if let Some(ref stack) = next.kstack { + crate::arch::gdt::set_tss_stack(pcr, stack.initial_top() as usize); + } + crate::arch::gdt::set_userspace_io_allowed(pcr, next.arch.userspace_io_allowed); + + core::arch::asm!( + alternative2!( + feature1: "xsaveopt", + then1: [" + mov eax, 0xffffffff + mov edx, eax + xsaveopt64 [{prev_fx}] + xrstor64 [{next_fx}] + "], + feature2: "xsave", + then2: [" + mov eax, 0xffffffff + mov edx, eax + xsave64 [{prev_fx}] + xrstor64 [{next_fx}] + "], + default: [" + fxsave64 [{prev_fx}] + fxrstor64 [{next_fx}] + "] + ), + prev_fx = in(reg) prev.kfx.as_mut_ptr(), + next_fx = in(reg) next.kfx.as_ptr(), + out("eax") _, + out("edx") _, + ); + + { + core::arch::asm!( + alternative!( + feature: "fsgsbase", + then: [" + mov rax, [{next}+{fsbase_off}] + mov rcx, [{next}+{gsbase_off}] + + rdfsbase rdx + wrfsbase rax + swapgs + rdgsbase rax + wrgsbase rcx + swapgs + + mov [{prev}+{fsbase_off}], rdx + mov [{prev}+{gsbase_off}], rax + "], + // TODO: Most applications will set FSBASE, but won't touch GSBASE. Maybe avoid + // wrmsr or even the swapgs+rdgsbase+wrgsbase+swapgs sequence if they are already + // equal? 
+ default: [" + mov ecx, {MSR_FSBASE} + mov rdx, [{next}+{fsbase_off}] + mov eax, edx + shr rdx, 32 + wrmsr + + mov ecx, {MSR_KERNEL_GSBASE} + mov rdx, [{next}+{gsbase_off}] + mov eax, edx + shr rdx, 32 + wrmsr + + // {prev} + "] + ), + out("rax") _, + out("rdx") _, + out("ecx") _, prev = in(reg) addr_of_mut!(prev.arch), next = in(reg) addr_of!(next.arch), + MSR_FSBASE = const msr::IA32_FS_BASE, + MSR_KERNEL_GSBASE = const msr::IA32_KERNEL_GSBASE, + gsbase_off = const offset_of!(Context, gsbase), + fsbase_off = const offset_of!(Context, fsbase), + ); + } + + (*pcr).percpu.new_addrsp_tmp.set(next.addr_space.clone()); + + switch_to_inner(&mut prev.arch, &mut next.arch) + } +} + +// Check disassembly! +#[unsafe(naked)] +unsafe extern "sysv64" fn switch_to_inner(_prev: &mut Context, _next: &mut Context) { + use Context as Cx; + + core::arch::naked_asm!( + // As a quick reminder for those who are unfamiliar with the System V ABI (extern "C"): + // + // - the current parameters are passed in the registers `rdi`, `rsi`, + // - we can modify scratch registers, e.g. rax + // - we cannot change callee-preserved registers arbitrarily, e.g. rbx, which is why we + // store them here in the first place. 
+ concat!(" + // Save old registers, and load new ones + mov [rdi + {off_rbx}], rbx + mov rbx, [rsi + {off_rbx}] + + mov [rdi + {off_r12}], r12 + mov r12, [rsi + {off_r12}] + + mov [rdi + {off_r13}], r13 + mov r13, [rsi + {off_r13}] + + mov [rdi + {off_r14}], r14 + mov r14, [rsi + {off_r14}] + + mov [rdi + {off_r15}], r15 + mov r15, [rsi + {off_r15}] + + mov [rdi + {off_rbp}], rbp + mov rbp, [rsi + {off_rbp}] + + mov [rdi + {off_rsp}], rsp + mov rsp, [rsi + {off_rsp}] + + // push RFLAGS (can only be modified via stack) + pushfq + // pop RFLAGS into `self.rflags` + pop QWORD PTR [rdi + {off_rflags}] + + // push `next.rflags` + push QWORD PTR [rsi + {off_rflags}] + // pop into RFLAGS + popfq + + // When we return, we cannot even guarantee that the return address on the stack, points to + // the calling function, `context::switch`. Thus, we have to execute this Rust hook by + // ourselves, which will unlock the contexts before the later switch. + + // Note that switch_finish_hook will be responsible for executing `ret`. 
+ jmp {switch_hook} + + "), + + off_rflags = const(offset_of!(Cx, rflags)), + + off_rbx = const(offset_of!(Cx, rbx)), + off_r12 = const(offset_of!(Cx, r12)), + off_r13 = const(offset_of!(Cx, r13)), + off_r14 = const(offset_of!(Cx, r14)), + off_r15 = const(offset_of!(Cx, r15)), + off_rbp = const(offset_of!(Cx, rbp)), + off_rsp = const(offset_of!(Cx, rsp)), + + switch_hook = sym crate::context::switch_finish_hook, + ); +} diff --git a/src/context/context.rs b/src/context/context.rs new file mode 100644 index 0000000000..6d723f498f --- /dev/null +++ b/src/context/context.rs @@ -0,0 +1,1074 @@ +use alloc::{collections::BTreeSet, sync::Arc, vec::Vec}; +use arrayvec::ArrayString; +use core::{ + mem::{self, size_of, ManuallyDrop}, + num::NonZeroUsize, + sync::atomic::{AtomicU32, Ordering}, +}; +use syscall::{SigProcControl, Sigcontrol, UPPER_FDTBL_TAG}; + +use crate::{ + arch::interrupt::InterruptStack, + common::aligned_box::AlignedBox, + context::{ + self, arch, + file::{FileDescriptor, LockedFileDescription}, + }, + cpu_set::{LogicalCpuId, LogicalCpuSet}, + cpu_stats, + ipi::{ipi, IpiKind, IpiTarget}, + memory::{ + allocate_p2frame, deallocate_p2frame, Enomem, Frame, RaiiFrame, RmmA, RmmArch, PAGE_SIZE, + }, + percpu::PercpuBlock, + scheme::{CallerCtx, FileHandle, SchemeId}, + sync::{CleanLockToken, LockToken, RwLock, L1, L3, L4, L5}, + syscall::usercopy::UserSliceRw, +}; + +use crate::syscall::error::{Error, Result, EAGAIN, EBADF, EEXIST, EINVAL, EMFILE, ESRCH}; + +use super::{ + empty_cr3, + memory::{AddrSpaceWrapper, GrantFileRef}, +}; + +/// The status of a context - used for scheduling +#[derive(Clone, Debug)] +pub enum Status { + Runnable, + + // TODO: Rename to SoftBlocked and move status_reason to this variant. + /// Not currently runnable, typically due to some blocking syscall, but it can be trivially + /// unblocked by e.g. signals. + Blocked, + + /// Not currently runnable, and cannot be runnable until manually unblocked, depending on what + /// reason. 
+    HardBlocked {
+        reason: HardBlockedReason,
+    },
+    Dead {
+        excp: Option,
+    },
+}
+
+impl Status {
+    /// True only for [`Status::Runnable`].
+    pub fn is_runnable(&self) -> bool {
+        matches!(self, Self::Runnable)
+    }
+    /// True only for [`Status::Blocked`]; hard-blocked contexts do not count.
+    pub fn is_soft_blocked(&self) -> bool {
+        matches!(self, Self::Blocked)
+    }
+}
+
+/// Why a context is in [`Status::HardBlocked`].
+#[derive(Clone, Debug)]
+pub enum HardBlockedReason {
+    /// "SIGSTOP"; only procmgr is allowed to switch contexts to this state
+    Stopped,
+    AwaitingMmap {
+        file_ref: GrantFileRef,
+    },
+    // TODO: PageFaultOom?
+    /// Initial state assigned by `Context::new`.
+    NotYetStarted,
+}
+
+const CONTEXT_NAME_CAPAC: usize = 32;
+
+/// State of one of the per-context syscall head/tail buffer frames.
+#[derive(Debug)]
+pub enum SyscallFrame {
+    /// The frame is owned by the context and can be borrowed.
+    Free(RaiiFrame),
+    // The field is used by the consistency checker of the kernel debugger
+    /// The frame is currently lent out through a `BorrowedHtBuf`.
+    Used { _frame: Frame },
+    /// Placeholder written while the slot's previous value is being moved out.
+    Dummy,
+}
+
+/// A context, which is typically mapped to a userspace thread
+#[derive(Debug)]
+pub struct Context {
+    pub debug_id: u32,
+    /// Signal handler
+    pub sig: Option,
+    /// Status of context
+    pub status: Status,
+    pub status_reason: &'static str,
+    /// Context running or not
+    pub running: bool,
+    /// Current CPU ID
+    pub cpu_id: Option,
+    /// Time this context was switched to
+    pub switch_time: u128,
+    /// Amount of CPU time used
+    pub cpu_time: u128,
+    /// Scheduler CPU affinity. If set, [`cpu_id`] can never be anything other than [`None`] or
+    /// this value.
+    pub sched_affinity: LogicalCpuSet,
+    /// Keeps track of whether this context is currently handling a syscall. Only up-to-date when
+    /// not running.
+    pub inside_syscall: bool,
+
+    #[cfg(feature = "syscall_debug")]
+    pub syscall_debug_info: crate::syscall::debug::SyscallDebugInfo,
+
+    /// Head buffer to use when system call buffers are not page aligned
+    // TODO: Store in user memory?
+    pub syscall_head: SyscallFrame,
+    /// Tail buffer to use when system call buffers are not page aligned
+    // TODO: Store in user memory?
+ pub syscall_tail: SyscallFrame, + /// Context should wake up at specified time + pub wake: Option, + /// The architecture specific context + pub arch: arch::Context, + /// Kernel FX - used to store SIMD and FPU registers on context switch + pub kfx: AlignedBox<[u8], { arch::KFX_ALIGN }>, + /// Kernel stack, if located on the heap. + pub kstack: Option, + /// Address space containing a page table lock, and grants. Normally this will have a value, + /// but can be None while the context is being reaped or when a new context is created but has + /// not yet had its address space changed. Note that these are only for user mappings; kernel + /// mappings are universal and independent on address spaces or contexts. + pub addr_space: Option>, + /// The name of the context + pub name: ArrayString, + /// The open files in the scheme + pub files: Arc, + /// All contexts except kmain will primarily live in userspace, and enter the kernel only when + /// interrupts or syscalls occur. This flag is set for all contexts but kmain. + pub userspace: bool, + pub being_sigkilled: bool, + pub fmap_ret: Option, + /// Priority + pub prio: usize, + + // TODO: id can reappear after wraparound? + pub owner_proc_id: Option, + + // TODO: Temporary replacement for existing kernel logic, replace with capabilities! + pub euid: u32, + pub egid: u32, + pub pid: usize, + /// Supplementary group IDs for access control decisions. + pub groups: Vec, + + // See [`PreemptGuard`] + // + // When > 0, preemption is disabled. + pub(super) preempt_locks: usize, +} + +#[derive(Debug)] +pub struct SignalState { + /// Offset to jump to when a signal is received. + pub user_handler: NonZeroUsize, + /// Offset to jump to when a program fault occurs. If None, the context is sigkilled. + pub excp_handler: Option, + + /// Signal control pages, shared memory + pub thread_control: RaiiFrame, + pub proc_control: RaiiFrame, + /// Offset within the control pages of respective word-aligned structs. 
+ pub threadctl_off: u16, + pub procctl_off: u16, +} + +impl Context { + pub fn new(owner_proc_id: Option) -> Result { + static DEBUG_ID: AtomicU32 = AtomicU32::new(1); + let this = Self { + debug_id: DEBUG_ID.fetch_add(1, Ordering::Relaxed), + sig: None, + status: Status::HardBlocked { + reason: HardBlockedReason::NotYetStarted, + }, + status_reason: "", + running: false, + cpu_id: None, + switch_time: 0, + cpu_time: 0, + sched_affinity: LogicalCpuSet::all(), + inside_syscall: false, + syscall_head: SyscallFrame::Free(RaiiFrame::allocate()?), + syscall_tail: SyscallFrame::Free(RaiiFrame::allocate()?), + wake: None, + arch: arch::Context::new(), + kfx: AlignedBox::<[u8], { arch::KFX_ALIGN }>::try_zeroed_slice(crate::arch::kfx_size())?, + kstack: None, + addr_space: None, + name: ArrayString::new(), + files: Arc::new(RwLock::new(FdTbl::new())), + userspace: false, + fmap_ret: None, + prio: 20, + being_sigkilled: false, + owner_proc_id, + + euid: 0, + egid: 0, + pid: 0, + groups: Vec::new(), + + #[cfg(feature = "syscall_debug")] + syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(), + + preempt_locks: 0, + }; + cpu_stats::add_context(); + Ok(this) + } + + pub fn is_preemptable(&self) -> bool { + self.preempt_locks == 0 + } + + /// Block the context, and return true if it was runnable before being blocked + pub fn block(&mut self, reason: &'static str) -> bool { + if self.status.is_runnable() { + self.status = Status::Blocked; + self.status_reason = reason; + true + } else { + false + } + } + + pub fn hard_block(&mut self, reason: HardBlockedReason) -> bool { + if self.status.is_runnable() { + self.status = Status::HardBlocked { reason }; + + true + } else { + false + } + } + + /// Unblock context, and return true if it was blocked before being marked runnable + pub fn unblock(&mut self) -> bool { + if self.unblock_no_ipi() { + // TODO: Only send IPI if currently running? 
+ if let Some(cpu_id) = self.cpu_id + && cpu_id != crate::cpu_id() + { + // Send IPI if not on current CPU + ipi(IpiKind::Wakeup, IpiTarget::Other); + } + + true + } else { + false + } + } + + /// Unblock context without IPI, and return true if it was blocked before being marked runnable + pub fn unblock_no_ipi(&mut self) -> bool { + if self.status.is_soft_blocked() { + self.status = Status::Runnable; + self.status_reason = ""; + + true + } else { + false + } + } + + /// Add a file to the lowest available slot. + /// Return the file descriptor number or None if no slot was found + pub fn add_file( + &self, + file: FileDescriptor, + lock_token: &mut LockToken, + ) -> Option { + self.add_file_min(file, 0, lock_token) + } + + /// Add a file to the lowest available slot greater than or equal to min. + /// Return the file descriptor number or None if no slot was found + pub fn add_file_min( + &self, + file: FileDescriptor, + min: usize, + lock_token: &mut LockToken, + ) -> Option { + self.files.write(lock_token.token()).add_file_min(file, min) + } + + /// Bulk-add multiple files to the POSIX file table + pub fn bulk_add_files_posix( + &self, + files_to_add: Vec, + lock_token: &mut LockToken, + ) -> Option> { + self.files + .write(lock_token.token()) + .bulk_add_files_posix(files_to_add) + } + + /// Bulk-insert multiple files into to the upper file table contiguously + pub fn bulk_insert_files_upper( + &self, + files_to_insert: Vec, + lock_token: &mut LockToken, + ) -> Option> { + self.files + .write(lock_token.token()) + .bulk_insert_files_upper(files_to_insert) + } + + /// Bulk-insert multiple files into to the upper file table manually + pub fn bulk_insert_files_upper_manual( + &self, + files_to_insert: Vec, + handles: &[FileHandle], + lock_token: &mut LockToken, + ) -> Result<()> { + self.files + .write(lock_token.token()) + .bulk_insert_files_upper_manual(files_to_insert, handles) + } + + /// Get a file + pub fn get_file( + &self, + i: FileHandle, + lock_token: &mut 
LockToken, + ) -> Option { + self.files.read(lock_token.token()).get_file(i) + } + + /// Bulk get files + pub fn bulk_get_files( + &self, + handles: &[FileHandle], + lock_token: &mut LockToken, + ) -> Result> { + self.files.read(lock_token.token()).bulk_get_files(handles) + } + + /// Insert a file with a specific handle number. This is used by dup2 + /// Return the file descriptor number or None if the slot was not empty, or i was invalid + pub fn insert_file( + &self, + i: FileHandle, + file: FileDescriptor, + lock_token: &mut LockToken, + ) -> Option { + self.files.write(lock_token.token()).insert_file(i, file) + } + + /// Remove a file + // TODO: adjust files vector to smaller size if possible + pub fn remove_file( + &self, + i: FileHandle, + lock_token: &mut LockToken, + ) -> Option { + self.files.write(lock_token.token()).remove_file(i) + } + + /// Bulk remove files + pub fn bulk_remove_files( + &self, + handles: &[FileHandle], + lock_token: &mut LockToken, + ) -> Result> { + self.files + .write(lock_token.token()) + .bulk_remove_files(handles) + } + + pub fn is_current_context(&self) -> bool { + self.running && self.cpu_id == Some(crate::cpu_id()) + } + + pub fn addr_space(&self) -> Result<&Arc> { + self.addr_space.as_ref().ok_or(Error::new(ESRCH)) + } + pub fn set_addr_space( + &mut self, + addr_space: Option>, + token: LockToken, + ) -> Option> { + if let (Some(old), Some(new)) = (&self.addr_space, &addr_space) + && Arc::ptr_eq(old, new) + { + return addr_space; + }; + + if self.is_current_context() { + // TODO: Share more code with context::arch::switch_to. + let this_percpu = PercpuBlock::current(); + + if let Some(ref prev_addrsp) = self.addr_space { + assert!(Arc::ptr_eq( + this_percpu.current_addrsp.borrow().as_ref().unwrap(), + prev_addrsp + )); + + // See [`crate::percpu::switch_arch_hook`]. 
+ prev_addrsp.used_by.atomic_clear(this_percpu.cpu_id); + + core::sync::atomic::fence(Ordering::SeqCst); + this_percpu.maybe_handle_tlb_shootdown(); + } + + let _old_addrsp = mem::replace( + &mut *this_percpu.current_addrsp.borrow_mut(), + addr_space.clone(), + ); + + match addr_space { + Some(ref new) => { + new.used_by.atomic_set(this_percpu.cpu_id); + let new_addrsp = new.acquire_read(token); + unsafe { + new_addrsp.table.utable.make_current(); + } + } + _ => unsafe { + crate::memory::RmmA::set_table(rmm::TableKind::User, empty_cr3()); + }, + } + } else { + assert!(!self.running); + } + + core::mem::replace(&mut self.addr_space, addr_space) + } + + fn can_access_regs(&self) -> bool { + self.userspace + } + + pub fn regs(&self) -> Option<&InterruptStack> { + if !self.can_access_regs() { + return None; + } + let kstack = self.kstack.as_ref()?; + Some(unsafe { &*kstack.initial_top().sub(size_of::()).cast() }) + } + pub fn regs_mut(&mut self) -> Option<&mut InterruptStack> { + if !self.can_access_regs() { + return None; + } + let kstack = self.kstack.as_ref()?; + Some(unsafe { &mut *kstack.initial_top().sub(size_of::()).cast() }) + } + pub fn sigcontrol(&mut self) -> Option<(&Sigcontrol, &SigProcControl, &mut SignalState)> { + Some(Self::sigcontrol_raw(self.sig.as_mut()?)) + } + pub fn sigcontrol_raw( + sig: &mut SignalState, + ) -> (&Sigcontrol, &SigProcControl, &mut SignalState) { + let check = |off| { + assert_eq!(usize::from(off) % align_of::(), 0); + assert!(usize::from(off).saturating_add(size_of::()) < PAGE_SIZE); + }; + check(sig.procctl_off); + check(sig.threadctl_off); + + let for_thread = unsafe { + &*(RmmA::phys_to_virt(sig.thread_control.get().base()).data() as *const Sigcontrol) + .byte_add(usize::from(sig.threadctl_off)) + }; + let for_proc = unsafe { + &*(RmmA::phys_to_virt(sig.proc_control.get().base()).data() as *const SigProcControl) + .byte_add(usize::from(sig.procctl_off)) + }; + + (for_thread, for_proc, sig) + } + pub fn caller_ctx(&self) -> 
CallerCtx {
+        CallerCtx {
+            uid: self.euid,
+            gid: self.egid,
+            pid: self.pid,
+            groups: self.groups.clone(),
+        }
+    }
+}
+
+/// Wrapper struct for borrowing the syscall head or tail buf.
+///
+/// While an instance is alive the corresponding slot of the current context is marked
+/// `SyscallFrame::Used`; returning the buffer puts it back as `SyscallFrame::Free`.
+#[derive(Debug)]
+pub struct BorrowedHtBuf {
+    inner: Option,
+    head_and_not_tail: bool,
+}
+impl BorrowedHtBuf {
+    /// Borrows the current context's head buffer.
+    ///
+    /// Returns `EAGAIN` if the buffer is not free; in that case the slot is left exactly as it
+    /// was found.
+    pub fn head_locked(token: LockToken) -> Result {
+        let current = context::current();
+        let frame = &mut current.write(token).syscall_head;
+        match mem::replace(frame, SyscallFrame::Dummy) {
+            SyscallFrame::Free(free_frame) => {
+                *frame = SyscallFrame::Used {
+                    _frame: free_frame.get(),
+                };
+                Ok(Self {
+                    inner: Some(free_frame),
+                    head_and_not_tail: true,
+                })
+            }
+            // Put the previous state back: a failed attempt must not overwrite the `Used`
+            // record that the debugger's consistency checker relies on.
+            busy @ (SyscallFrame::Used { .. } | SyscallFrame::Dummy) => {
+                *frame = busy;
+                Err(Error::new(EAGAIN))
+            }
+        }
+    }
+    /// Borrows the current context's tail buffer.
+    ///
+    /// Returns `EAGAIN` if the buffer is not free; in that case the slot is left exactly as it
+    /// was found.
+    pub fn tail_locked(token: LockToken) -> Result {
+        let current = context::current();
+        let frame = &mut current.write(token).syscall_tail;
+        match mem::replace(frame, SyscallFrame::Dummy) {
+            SyscallFrame::Free(free_frame) => {
+                *frame = SyscallFrame::Used {
+                    _frame: free_frame.get(),
+                };
+                Ok(Self {
+                    inner: Some(free_frame),
+                    head_and_not_tail: false,
+                })
+            }
+            // See `head_locked`: restore the slot instead of leaving `Dummy` behind.
+            busy @ (SyscallFrame::Used { .. } | SyscallFrame::Dummy) => {
+                *frame = busy;
+                Err(Error::new(EAGAIN))
+            }
+        }
+    }
+    /// Views the borrowed frame as a page-sized byte array. Panics if the frame was already
+    /// given back.
+    pub fn buf(&self) -> &[u8; PAGE_SIZE] {
+        unsafe {
+            &*(RmmA::phys_to_virt(self.inner.as_ref().expect("must succeed").get().base()).data()
+                as *const [u8; PAGE_SIZE])
+        }
+    }
+    /// Mutable counterpart of [`Self::buf`].
+    pub fn buf_mut(&mut self) -> &mut [u8; PAGE_SIZE] {
+        unsafe {
+            &mut *(RmmA::phys_to_virt(self.inner.as_mut().expect("must succeed").get().base())
+                .data() as *mut [u8; PAGE_SIZE])
+        }
+    }
+    /// Returns the borrowed physical frame. Panics if the frame was already given back.
+    pub fn frame(&self) -> Frame {
+        self.inner.as_ref().expect("must succeed").get()
+    }
+    /*
+    pub fn use_for_slice(&mut self, raw: UserSlice) -> Result> {
+        if raw.len() > self.buf().len() {
+            return Ok(None);
+        }
+        raw.copy_to_slice(&mut self.buf_mut()[..raw.len()])?;
+        Ok(Some(&self.buf()[..raw.len()]))
+    }
+    pub fn use_for_string(&mut self, raw: UserSlice) -> Result<&str> {
+        let slice = self.use_for_slice(raw)?.ok_or(Error::new(ENAMETOOLONG))?;
+        core::str::from_utf8(slice).map_err(|_| Error::new(EINVAL))
+    }
+    pub unsafe fn use_for_struct(&mut self) -> Result<&mut T> {
+        if size_of::() > PAGE_SIZE || align_of::() > PAGE_SIZE {
+            return Err(Error::new(EINVAL));
+        }
+        self.buf_mut().fill(0_u8);
+        Ok(unsafe { &mut *self.buf_mut().as_mut_ptr().cast() })
+    }
+    */
+
+    /// Gives the buffer back using the caller's lock token, bypassing the `Drop` impl.
+    pub fn into_drop(self, token: &mut CleanLockToken) {
+        ManuallyDrop::new(self).inner_drop(token);
+    }
+
+    /// Moves the frame back into the current context's head or tail slot as `Free`.
+    /// Does nothing if the frame was already returned.
+    fn inner_drop(&mut self, token: &mut CleanLockToken) {
+        let context = context::current();
+
+        let Some(inner) = self.inner.take() else {
+            return;
+        };
+        let mut context = context.write(token.token());
+        {
+            *(if self.head_and_not_tail {
+                &mut context.syscall_head
+            } else {
+                &mut context.syscall_tail
+            }) = SyscallFrame::Free(inner);
+        }
+    }
+}
+
+impl Drop for BorrowedHtBuf {
+    /// Returns the frame with a freshly created `CleanLockToken`. With the `drop_panic` feature
+    /// enabled this then panics.
+    fn drop(&mut self) {
+        let mut token = unsafe { CleanLockToken::new() };
+        self.inner_drop(&mut token);
+        #[cfg(feature = "drop_panic")]
+        {
+            panic!("BorrowedHtBuf dropped");
+        }
+    }
+}
+
+pub struct Kstack {
+    /// naturally aligned, order 4
+    base: Frame,
+}
+impl Kstack {
+    pub fn new() ->
Result { + Ok(Self { + base: allocate_p2frame(4).ok_or(Enomem)?, + }) + } + pub fn initial_top(&self) -> *mut u8 { + unsafe { (RmmA::phys_to_virt(self.base.base()).data() as *mut u8).add(PAGE_SIZE << 4) } + } + pub fn len(&self) -> usize { + PAGE_SIZE << 4 + } +} + +impl Drop for Kstack { + fn drop(&mut self) { + unsafe { deallocate_p2frame(self.base, 4) } + } +} +impl core::fmt::Debug for Kstack { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "[kstack at {:?}]", self.base) + } +} + +#[derive(Clone, Debug, Default)] +pub struct FdTbl { + pub posix_fdtbl: Vec>, + pub upper_fdtbl: Vec>, + active_count: usize, +} + +pub type LockedFdTbl = RwLock; + +impl FdTbl { + pub fn new() -> Self { + Self { + posix_fdtbl: Vec::new(), + upper_fdtbl: Vec::new(), + active_count: 0, + } + } + + fn strip_tags(index: usize) -> usize { + index & !UPPER_FDTBL_TAG + } + + fn select_fdtbl(&self, index: usize) -> (&Vec>, usize) { + if index & UPPER_FDTBL_TAG == 0 { + (&self.posix_fdtbl, index) + } else { + (&self.upper_fdtbl, Self::strip_tags(index)) + } + } + + fn select_fdtbl_mut(&mut self, index: usize) -> (&mut Vec>, usize) { + if index & UPPER_FDTBL_TAG == 0 { + (&mut self.posix_fdtbl, index) + } else { + (&mut self.upper_fdtbl, Self::strip_tags(index)) + } + } + + fn validate_handles(&self, handles: &[FileHandle]) -> Result<()> { + let mut checked_handles = BTreeSet::new(); + for i in handles { + let index = i.get(); + if Self::strip_tags(index) >= super::CONTEXT_MAX_FILES { + return Err(Error::new(EMFILE)); + } + if !checked_handles.insert(index) { + return Err(Error::new(EBADF)); // Duplicate handle + } + if !matches!(self.get(index), Some(Some(_))) { + return Err(Error::new(EBADF)); + } + } + + Ok(()) + } + + fn validate_free_slots(&self, handles: &[FileHandle]) -> Result<()> { + let mut checked_slots = BTreeSet::new(); + for i in handles { + let index = i.get(); + if Self::strip_tags(index) >= super::CONTEXT_MAX_FILES { + return 
Err(Error::new(EMFILE));
+            }
+            if !checked_slots.insert(index) {
+                return Err(Error::new(EINVAL)); // Duplicate slots
+            }
+            if matches!(self.get(index), Some(Some(_))) {
+                return Err(Error::new(EEXIST));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Stores `file` in the lowest free slot at or above `min` and returns its handle.
+    ///
+    /// The `UPPER_FDTBL_TAG` bit of `min` selects which table is used and is carried over into
+    /// the returned handle. Returns `None` when the context already holds `CONTEXT_MAX_FILES`
+    /// files or when the only available index would be `CONTEXT_MAX_FILES` or higher.
+    pub fn add_file_min(&mut self, file: FileDescriptor, min: usize) -> Option {
+        if self.active_count >= super::CONTEXT_MAX_FILES {
+            return None;
+        }
+
+        let tag = min & UPPER_FDTBL_TAG;
+
+        let (fdtbl, min) = self.select_fdtbl_mut(min);
+
+        // Find the first empty slot in the selected table starting from `min`.
+        if let Some((pos, slot)) = fdtbl
+            .iter_mut()
+            .enumerate()
+            .skip(min)
+            .find(|(_, slot)| slot.is_none())
+        {
+            *slot = Some(file);
+            self.active_count += 1;
+            return Some(FileHandle::from(pos | tag));
+        };
+
+        let len = fdtbl.len();
+
+        // If no empty slot was found, we need to allocate a new slot.
+        if len >= min {
+            // Appending creates index `len`. `insert_file` and the bulk validators reject
+            // indices >= CONTEXT_MAX_FILES, so do not hand one out here either. This can
+            // otherwise happen when the table is already at full length with free slots only
+            // below `min`.
+            if len >= super::CONTEXT_MAX_FILES {
+                return None;
+            }
+            fdtbl.push(Some(file));
+            self.active_count += 1;
+            Some(FileHandle::from(len | tag))
+        } else {
+            self.insert_file(FileHandle::from(min | tag), file)
+        }
+    }
+
+    fn bulk_add_files_posix(
+        &mut self,
+        files_to_add: Vec,
+    ) -> Option> {
+        let count = files_to_add.len();
+        if count == 0 {
+            return Some(Vec::new());
+        }
+        if self.active_count + count > super::CONTEXT_MAX_FILES {
+            return None;
+        }
+
+        let handles = self.find_free_posix_slots(count);
+        let max_index = handles[count - 1].get();
+        if self.posix_fdtbl.len() <= max_index {
+            // Resize the posix_fdtbl to accommodate the new files.
+ self.posix_fdtbl.resize(max_index + 1, None); + } + + for (&handle, file) in handles.iter().zip(files_to_add) { + let index = handle.get(); + self.posix_fdtbl[index] = Some(file); + } + + self.active_count += count; + Some(handles) + } + + fn insert_file(&mut self, i: FileHandle, file: FileDescriptor) -> Option { + if self.active_count >= super::CONTEXT_MAX_FILES { + return None; + } + let index = i.get(); + let (fdtbl, real_index) = self.select_fdtbl_mut(index); + + if real_index >= super::CONTEXT_MAX_FILES { + return None; + } + + if real_index >= fdtbl.len() { + fdtbl.resize_with(real_index + 1, || None); + } + + if let Some(slot @ None) = fdtbl.get_mut(real_index) { + *slot = Some(file); + self.active_count += 1; + Some(i) + } else { + None + } + } + + fn bulk_insert_files_upper( + &mut self, + files_to_insert: Vec, + ) -> Option> { + let count = files_to_insert.len(); + if count == 0 { + return Some(Vec::new()); + } + if self.active_count + count > super::CONTEXT_MAX_FILES { + return None; + } + + let index = Self::strip_tags(self.find_free_upper_block(count).get()); + let mut handles = Vec::with_capacity(count); + for (i, file) in files_to_insert.into_iter().enumerate() { + let current_index = index + i; + self.upper_fdtbl[current_index] = Some(file); + handles.push(FileHandle::from(current_index | UPPER_FDTBL_TAG)); + } + + self.active_count += count; + Some(handles) + } + + fn bulk_insert_files_upper_manual( + &mut self, + files_to_insert: Vec, + handles: &[FileHandle], + ) -> Result<()> { + if handles.len() != files_to_insert.len() { + return Err(Error::new(EINVAL)); + } + let count = files_to_insert.len(); + if count == 0 { + return Ok(()); + } + if self.active_count + count > super::CONTEXT_MAX_FILES { + return Err(Error::new(EMFILE)); + } + self.validate_free_slots(handles)?; + + let max_index = handles + .iter() + .map(|h| Self::strip_tags(h.get())) + .max() + .unwrap_or(0); + if self.upper_fdtbl.len() <= max_index { + 
self.upper_fdtbl.resize_with(max_index + 1, || None); + } + for (file, &handle) in files_to_insert.into_iter().zip(handles) { + let index = Self::strip_tags(handle.get()); + self.upper_fdtbl[index] = Some(file); + } + + self.active_count += count; + Ok(()) + } + + pub fn get(&self, index: usize) -> Option<&Option> { + let (fdtbl, real_index) = self.select_fdtbl(index); + + fdtbl.get(real_index) + } + + pub fn get_mut(&mut self, index: usize) -> Option<&mut Option> { + let (fdtbl, real_index) = self.select_fdtbl_mut(index); + + fdtbl.get_mut(real_index) + } + + pub fn get_file(&self, i: FileHandle) -> Option { + self.get(i.get()).cloned().flatten() + } + + fn bulk_get_files(&self, handles: &[FileHandle]) -> Result> { + // Validate that all handles are valid before proceeding to avoid partial results. + self.validate_handles(handles)?; + + let files = handles + .iter() + .map(|&i| self.get_file(i).expect("File should exist")) + .collect(); + + Ok(files) + } + + // TODO: Faster, cleaner mechanism to get descriptor + // Find a file descriptor by scheme id and number. + pub fn find_by_scheme( + &self, + scheme_id: SchemeId, + scheme_number: usize, + token: &mut LockToken, + ) -> Result { + self.iter() + .flatten() + .find(|&context_fd| { + let desc = context_fd.description.read(token.token()); + desc.scheme == scheme_id && desc.number == scheme_number + }) + .cloned() + .ok_or(Error::new(EBADF)) + } + + fn remove_file(&mut self, i: FileHandle) -> Option { + let index = i.get(); + let (fdtbl, real_index) = self.select_fdtbl_mut(index); + + let removed_file_opt = fdtbl.get_mut(real_index).and_then(|opt| opt.take()); + if removed_file_opt.is_some() { + self.active_count -= 1; + } + + removed_file_opt + } + + fn bulk_remove_files(&mut self, handles: &[FileHandle]) -> Result> { + // Validate that all handles are valid before proceeding to avoid partial results. 
+ self.validate_handles(handles)?; + + let files = handles + .iter() + .map(|&i| self.remove_file(i).expect("File should exist")) + .collect(); + + Ok(files) + } + + fn find_free_posix_slots(&self, count: usize) -> Vec { + let mut free_slots = Vec::with_capacity(count); + + for (i, slot) in self.posix_fdtbl.iter().enumerate() { + if slot.is_none() { + free_slots.push(FileHandle::from(i)); + if free_slots.len() == count { + return free_slots; + } + } + } + + let mut current_len = self.posix_fdtbl.len(); + while free_slots.len() < count { + free_slots.push(FileHandle::from(current_len)); + current_len += 1; + } + free_slots + } + + fn find_free_upper_block(&mut self, len: usize) -> FileHandle { + let mut start = 0; + let mut count = 0; + + for (i, file_opt) in self.upper_fdtbl.iter().enumerate() { + if file_opt.is_none() { + if count == 0 { + start = i; + } + count += 1; + if count == len { + break; + } + } else { + count = 0; + } + } + + if count < len { + if count == 0 { + start = self.upper_fdtbl.len(); + } + let needed = len - count; + self.upper_fdtbl + .resize(self.upper_fdtbl.len() + needed, None); + } + + FileHandle::from(start | UPPER_FDTBL_TAG) + } + + pub fn force_close_all(&mut self, token: &mut CleanLockToken) { + for file_opt in self.iter_mut() { + if let Some(file) = file_opt.take() { + let _ = file.close(token); + } + } + self.active_count = 0; + } +} + +impl FdTbl { + pub fn enumerate(&self) -> impl Iterator)> { + self.posix_fdtbl.iter().enumerate().chain( + self.upper_fdtbl + .iter() + .enumerate() + .map(|(i, fd)| (i | UPPER_FDTBL_TAG, fd)), + ) + } + + pub fn iter(&self) -> impl Iterator> { + self.posix_fdtbl.iter().chain(self.upper_fdtbl.iter()) + } + + pub fn iter_mut(&mut self) -> impl Iterator> { + self.posix_fdtbl + .iter_mut() + .chain(self.upper_fdtbl.iter_mut()) + } +} + +pub fn bulk_add_fds( + descriptions: Vec>, + payload: UserSliceRw, + cloexec: bool, + token: &mut LockToken, +) -> Result { + let cnt = descriptions.len(); + if 
payload.len() != cnt * size_of::() { + return Err(Error::new(EINVAL)); + } + if descriptions.is_empty() { + return Ok(0); + } + let current_lock = context::current(); + let mut current = current_lock.write(token.token()); + let (current, mut token) = current.token_split(); + + let files: Vec = descriptions + .into_iter() + .map(|description| FileDescriptor { + description, + cloexec, + }) + .collect(); + let handles = current + .bulk_add_files_posix(files, &mut token) + .ok_or(Error::new(EMFILE))?; + let payload_chunks = payload.in_exact_chunks(size_of::()); + for (handle, chunk) in handles.iter().zip(payload_chunks) { + chunk.copy_from_slice(&handle.get().to_ne_bytes())?; + } + Ok(handles.len()) +} + +pub fn bulk_insert_fds( + descriptions: Vec>, + payload: UserSliceRw, + cloexec: bool, + token: &mut LockToken, +) -> Result { + let cnt = descriptions.len(); + if payload.len() != cnt * size_of::() { + return Err(Error::new(EINVAL)); + } + if descriptions.is_empty() { + return Ok(0); + } + let files_iter = descriptions.into_iter().map(|description| FileDescriptor { + description, + cloexec, + }); + let first_fd = payload + .in_exact_chunks(size_of::()) + .next() + .ok_or(Error::new(EINVAL))? 
+ .read_usize()?; + + let current_lock = context::current(); + let mut current = current_lock.write(token.token()); + let (current, mut token) = current.token_split(); + + if first_fd == usize::MAX { + let files = files_iter.collect::>(); + let handles = current + .bulk_insert_files_upper(files, &mut token) + .ok_or(Error::new(EMFILE))?; + let payload_chunks = payload.in_exact_chunks(size_of::()); + for (handle, chunk) in handles.iter().zip(payload_chunks) { + chunk.copy_from_slice(&handle.get().to_ne_bytes())?; + } + Ok(handles.len()) + } else { + let handles: Vec = payload + .usizes() + .map(|res| res.map(|i| FileHandle::from(i | syscall::UPPER_FDTBL_TAG))) + .collect::>()?; + let files = files_iter.collect::>(); + current.bulk_insert_files_upper_manual(files, &handles, &mut token)?; + Ok(handles.len()) + } +} diff --git a/src/context/file.rs b/src/context/file.rs new file mode 100644 index 0000000000..2d3790f147 --- /dev/null +++ b/src/context/file.rs @@ -0,0 +1,104 @@ +//! File structs + +use crate::{ + event, + scheme::{self, SchemeId}, + sync::{CleanLockToken, RwLock, L6}, + syscall::error::Result, +}; +use alloc::sync::Arc; +use syscall::{schemev2::NewFdFlags, RwFlags, O_APPEND, O_NONBLOCK}; + +pub type LockedFileDescription = RwLock; + +/// A file description +#[derive(Clone, Copy, Debug)] +pub struct FileDescription { + /// The current file offset (seek) + pub offset: u64, + /// The scheme that this file refers to + pub scheme: SchemeId, + /// The number the scheme uses to refer to this file + pub number: usize, + /// The flags passed to open or fcntl(SETFL) + pub flags: u32, + pub internal_flags: InternalFlags, +} +bitflags! 
{ + #[derive(Clone, Copy, Debug)] + pub struct InternalFlags: u32 { + const POSITIONED = 1 << 0; + const NOTIFY_ON_NEXT_DETACH = 1 << 1; + } +} +impl FileDescription { + pub fn rw_flags(&self, rw: RwFlags) -> u32 { + let mut ret = self.flags & !(O_NONBLOCK | O_APPEND) as u32; + if rw.contains(RwFlags::APPEND) { + ret |= O_APPEND as u32; + } + if rw.contains(RwFlags::NONBLOCK) { + ret |= O_NONBLOCK as u32; + } + ret + } +} +impl InternalFlags { + pub fn from_extra0(fl: u8) -> Option { + Some( + NewFdFlags::from_bits(fl)? + .iter() + .map(|fd| { + if fd == NewFdFlags::POSITIONED { + Self::POSITIONED + } else { + Self::empty() + } + }) + .collect(), + ) + } +} + +/// A file descriptor +#[derive(Clone, Debug)] +#[must_use = "File descriptors must be closed"] +pub struct FileDescriptor { + /// Corresponding file description + pub description: Arc, + /// Cloexec flag + pub cloexec: bool, +} + +impl FileDescription { + /// Try closing a file, although at this point the description will be destroyed anyway, if + /// doing so fails. 
+ pub fn try_close(self, token: &mut CleanLockToken) -> Result<()> { + event::unregister_file(self.scheme, self.number, token); + + let scheme = scheme::get_scheme(token.token(), self.scheme)?; + + scheme.close(self.number, token) + } +} + +impl FileDescriptor { + pub fn close(self, token: &mut CleanLockToken) -> Result<()> { + { + let (scheme_id, number, internal_flags) = { + let desc = self.description.read(token.token()); + (desc.scheme, desc.number, desc.internal_flags) + }; + if internal_flags.contains(InternalFlags::NOTIFY_ON_NEXT_DETACH) { + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + scheme.detach(number, token)?; + } + } + + if let Ok(file) = Arc::try_unwrap(self.description).map(RwLock::into_inner) { + file.try_close(token)?; + } + + Ok(()) + } +} diff --git a/src/context/memory.rs b/src/context/memory.rs new file mode 100644 index 0000000000..93446ba7a7 --- /dev/null +++ b/src/context/memory.rs @@ -0,0 +1,2984 @@ +use alloc::{collections::BTreeMap, sync::Arc, vec::Vec}; +use arrayvec::ArrayVec; +use core::{ + cmp, + fmt::Debug, + mem::ManuallyDrop, + num::NonZeroUsize, + ops::Bound, + sync::atomic::{AtomicU32, Ordering}, +}; +use rmm::{Arch as _, PageFlush}; +use syscall::{error::*, flag::MapFlags, GrantFlags, MunmapFlags}; + +use crate::{ + context::file::LockedFileDescription, + cpu_set::LogicalCpuSet, + memory::{ + deallocate_frame, get_page_info, init_frame, the_zeroed_frame, AddRefError, Enomem, Frame, + Page, PageFlags, PageInfo, PageMapper, RaiiFrame, RefCount, RefKind, RmmA, TableKind, + VirtualAddress, PAGE_SIZE, + }, + percpu::PercpuBlock, + scheme::{self, KernelSchemes}, + sync::{ + CleanLockToken, LockToken, RwLock, RwLockReadGuard, RwLockUpgradableGuard, + RwLockWriteGuard, L4, L5, + }, +}; + +use super::context::HardBlockedReason; + +pub const MMAP_MIN_DEFAULT: usize = PAGE_SIZE; + +pub fn page_flags(flags: MapFlags) -> PageFlags { + PageFlags::new() + .user(true) + .execute(flags.contains(MapFlags::PROT_EXEC)) + 
.write(flags.contains(MapFlags::PROT_WRITE)) + //TODO: PROT_READ +} +pub fn map_flags(page_flags: PageFlags) -> MapFlags { + let mut flags = MapFlags::PROT_READ; + if page_flags.has_write() { + flags |= MapFlags::PROT_WRITE; + } + if page_flags.has_execute() { + flags |= MapFlags::PROT_EXEC; + } + flags +} + +pub struct UnmapResult { + pub file_desc: Option, + pub size: usize, + pub flags: MunmapFlags, +} +impl UnmapResult { + pub fn unmap(mut self, token: &mut CleanLockToken) -> Result<()> { + let Some(GrantFileRef { + base_offset, + description, + }) = self.file_desc.take() + else { + return Ok(()); + }; + + let (scheme_id, number) = { + let desc = description.read(token.token()); // shared lock suffices: both fields are only copied out + (desc.scheme, desc.number) + }; + + let scheme_opt = scheme::get_scheme(token.token(), scheme_id); + let funmap_result = scheme_opt + .and_then(|scheme| scheme.kfunmap(number, base_offset, self.size, self.flags, token)); + + if let Ok(fd) = Arc::try_unwrap(description) { + fd.into_inner().try_close(token)?; + } + funmap_result?; + + Ok(()) + } +} + +#[derive(Debug)] +pub struct AddrSpaceWrapper { + pub inner: RwLock, + pub tlb_ack: AtomicU32, + pub used_by: LogicalCpuSet, +} +impl AddrSpaceWrapper { + pub fn new() -> Result> { + Ok(Arc::new(Self { + inner: RwLock::new(AddrSpace::new()?), + tlb_ack: AtomicU32::new(0), + used_by: LogicalCpuSet::empty(), + })) + } + pub fn acquire_read<'a>( + &'a self, + lock_token: LockToken<'a, L4>, + ) -> RwLockReadGuard<'a, L5, AddrSpace> { + self.inner.read(lock_token) + } + pub fn acquire_upgradeable_read<'a>( + &'a self, + lock_token: LockToken<'a, L4>, + ) -> RwLockUpgradableGuard<'a, L5, AddrSpace> { + self.inner.upgradeable_read(lock_token) + } + pub fn acquire_write<'a>( + &'a self, + lock_token: LockToken<'a, L4>, + ) -> RwLockWriteGuard<'a, L5, AddrSpace> { + self.inner.write(lock_token) + } + pub unsafe fn acquire_reupgradeable_read<'a>( + &'a self, + lock_token: LockToken<'a, L5>, + ) -> RwLockUpgradableGuard<'a, L5, AddrSpace> { + unsafe 
{ self.inner.reupgradeable_read(lock_token) } + } + pub unsafe fn acquire_rewrite<'a>( + &'a self, + lock_token: LockToken<'a, L5>, + ) -> RwLockWriteGuard<'a, L5, AddrSpace> { + unsafe { self.inner.rewrite(lock_token) } + } + pub fn into_drop(self, token: &mut CleanLockToken) { + self.inner.into_inner().into_drop(token); + } +} + +#[derive(Debug)] +pub struct AddrSpace { + pub table: Table, + pub grants: UserGrants, + /// Lowest offset for mmap invocations where the user has not already specified the offset + /// (using MAP_FIXED/MAP_FIXED_NOREPLACE). Cf. Linux's `/proc/sys/vm/mmap_min_addr`, but with + /// the exception that we have a memory safe kernel which doesn't have to protect itself + /// against null pointers, so fixed mmaps to address zero are still allowed. + pub mmap_min: usize, +} +impl AddrSpaceWrapper { + /// Attempt to clone an existing address space so that all mappings are copied (CoW). + pub fn try_clone(&self, token: &mut CleanLockToken) -> Result> { + let mut token = token.token(); + let mut guard = self.acquire_write(token.downgrade()); + let guard = &mut *guard; + + let mut new_arc = AddrSpaceWrapper::new()?; + + let new = + Arc::get_mut(&mut new_arc).expect("expected new address space Arc not to be aliased"); + + let _this_mapper = &mut guard.table.utable; + let this_mapper = &mut guard.table.utable; + let mut this_flusher = Flusher::with_cpu_set(&self.used_by, &self.tlb_ack); + + for (grant_base, grant_info) in guard.grants.iter() { + let new_grant = match grant_info.provider { + // No, your temporary UserScheme mappings will not be kept across forks. + Provider::External { + is_pinned_userscheme_borrow: true, + .. + } + | Provider::AllocatedShared { + is_pinned_userscheme_borrow: true, + .. + } => continue, + + // No, physically contiguous driver memory won't either. + Provider::Allocated { + phys_contiguous: true, + .. 
+ } => continue, + + Provider::PhysBorrowed { base } => Grant::physmap( + base, + PageSpan::new(grant_base, grant_info.page_count), + grant_info.flags, + &mut new.inner.get_mut().table.utable, + &mut NopFlusher, + )?, + Provider::Allocated { + ref cow_file_ref, + phys_contiguous: false, + } => Grant::copy_mappings( + grant_base, + grant_base, + grant_info.page_count, + grant_info.flags, + this_mapper, + &mut new.inner.get_mut().table.utable, + &mut this_flusher, + &mut NopFlusher, + CopyMappingsMode::Owned { + cow_file_ref: cow_file_ref.clone(), + }, + )?, + // TODO: Merge Allocated and AllocatedShared, and make CopyMappingsMode a field? + Provider::AllocatedShared { + is_pinned_userscheme_borrow: false, + } => Grant::copy_mappings( + grant_base, + grant_base, + grant_info.page_count, + grant_info.flags, + this_mapper, + &mut new.inner.get_mut().table.utable, + &mut this_flusher, + &mut NopFlusher, + CopyMappingsMode::Borrowed, + )?, + + // MAP_SHARED grants are retained by reference, across address space clones (the + // "fork" analogue from monolithic kernels). + Provider::External { + ref address_space, + src_base, + .. + } => Grant::borrow_grant( + Arc::clone(address_space), + src_base, + grant_base, + grant_info, + &mut new.inner.get_mut().table.utable, + &mut NopFlusher, + false, + )?, + Provider::FmapBorrowed { .. } => continue, + }; + + new.inner.get_mut().grants.insert(new_grant); + } + Ok(new_arc) + } + pub fn mprotect( + &self, + requested_span: PageSpan, + flags: MapFlags, + token: &mut CleanLockToken, + ) -> Result<()> { + let mut token = token.token(); + let mut guard = self.acquire_write(token.downgrade()); + let guard = &mut *guard; + + let mapper = &mut guard.table.utable; + let mut flusher = Flusher::with_cpu_set(&self.used_by, &self.tlb_ack); + + // TODO: Remove allocation (might require BTreeMap::set_key or interior mutability). 
+ let regions = guard + .grants + .conflicts(requested_span) + .map(|(base, info)| { + if info.is_pinned() { + Err(Error::new(EBUSY)) + } else { + Ok(PageSpan::new(base, info.page_count)) + } + }) + .collect::>(); + + for grant_span_res in regions { + let grant_span = grant_span_res?; + + let grant = guard + .grants + .remove_containing(grant_span.base) + .expect("grant cannot magically disappear while we hold the lock!"); + //info!("Mprotecting {:#?} to {:#?} in {:#?}", grant, flags, grant_span); + let intersection = grant_span.intersection(requested_span); + + let (before, mut grant, after) = grant + .extract(intersection) + .expect("failed to extract grant"); + //info!("Sliced into\n\n{:#?}\n\n{:#?}\n\n{:#?}", before, grant, after); + + if let Some(before) = before { + guard.grants.insert(before); + } + // FIXME: andypython: should this be done at the end? + if let Some(after) = after { + guard.grants.insert(after); + } + + if !grant.info.can_have_flags(flags) { + guard.grants.insert(grant); + return Err(Error::new(EACCES)); + } + + let new_flags = grant + .info + .flags() + // TODO: Require a capability in order to map executable memory? + .execute(flags.contains(MapFlags::PROT_EXEC)) + .write(flags.contains(MapFlags::PROT_WRITE)); + + // TODO: Allow enabling/disabling read access on architectures which allow it. On + // x86_64 with protection keys (although only enforced by userspace), and AArch64 (I + // think), execute-only memory is also supported. 
+ + grant.remap(mapper, &mut flusher, new_flags); + //info!("Mprotect grant became {:#?}", grant); + guard.grants.insert(grant); + } + Ok(()) + } + #[must_use = "needs to notify files"] + pub fn munmap( + &self, + requested_span: PageSpan, + unpin: bool, + token: &mut CleanLockToken, + ) -> Result> { + let mut token = token.token(); + let mut guard = self.acquire_write(token.downgrade()); + let guard = &mut *guard; + + let mut flusher = Flusher::with_cpu_set(&self.used_by, &self.tlb_ack); + AddrSpace::munmap_inner( + &mut guard.grants, + &mut guard.table.utable, + &mut flusher, + requested_span, + unpin, + ) + } + pub fn r#move( + &self, + src_opt: Option<(&AddrSpaceWrapper, &mut AddrSpace)>, + src_span: PageSpan, + requested_dst_base: Option, + new_page_count: usize, + new_flags: MapFlags, + mut notify_files_out: Option<&mut Vec>, + token: LockToken, + ) -> Result { + let dst_lock = self; + // SAFETY: This is moving data between two AddrSpace. Caller ensures the two is a different AddrSpace + let mut dst = unsafe { dst_lock.acquire_rewrite(token) }; + let dst = &mut *dst; + + let mut src_flusher; + let mut src_opt = match src_opt { + Some((aw, a)) => { + src_flusher = Flusher::with_cpu_set(&aw.used_by, &aw.tlb_ack); + Some((&mut a.grants, &mut a.table.utable, &mut src_flusher)) + } + None => None, + }; + let mut dst_flusher = Flusher::with_cpu_set(&dst_lock.used_by, &dst_lock.tlb_ack); + + let dst_base = match requested_dst_base { + Some(base) if new_flags.contains(MapFlags::MAP_FIXED_NOREPLACE) => { + if dst + .grants + .conflicts(PageSpan::new(base, new_page_count)) + .next() + .is_some() + { + return Err(Error::new(EEXIST)); + } + + base + } + Some(base) if new_flags.contains(MapFlags::MAP_FIXED) => { + let unpin = false; + let notify_files = AddrSpace::munmap_inner( + &mut dst.grants, + &mut dst.table.utable, + &mut dst_flusher, + PageSpan::new(base, new_page_count), + unpin, + )?; + if let Some(notify_files_out) = notify_files_out.as_mut() { + 
notify_files_out.extend(notify_files); + } + + base + } + _ => { + dst.grants + .find_free(dst.mmap_min, cmp::max(new_page_count, src_span.count)) + .ok_or(Error::new(ENOMEM))? + .base + } + }; + + let (src_grants, src_mapper, src_flusher) = match &mut src_opt { + Some((g, m, f)) => (&mut **g, &mut **m, &mut **f), + None => (&mut dst.grants, &mut dst.table.utable, &mut dst_flusher), + }; + + if src_grants + .conflicts(src_span) + .any(|(_, g)| !g.can_extract(false)) + { + return Err(Error::new(EBUSY)); + } + if src_grants + .conflicts(src_span) + .any(|(_, g)| !g.can_have_flags(new_flags)) + { + return Err(Error::new(EPERM)); + } + if PageSpan::new(dst_base, new_page_count).intersects(src_span) { + return Err(Error::new(EBUSY)); + } + + if new_page_count < src_span.count { + let unpin = false; + let notify_files: Vec = AddrSpace::munmap_inner( + src_grants, + src_mapper, + src_flusher, + PageSpan::new( + src_span.base.next_by(new_page_count), + src_span.count - new_page_count, + ), + unpin, + )?; + if let Some(notify_files_out) = notify_files_out.as_mut() { + notify_files_out.extend(notify_files); + } + } + + let mut remaining_src_span = + PageSpan::new(src_span.base, cmp::min(src_span.count, new_page_count)); + + let to_remap = src_grants + .conflicts(remaining_src_span) + .map(|(b, _)| b) + .collect::>(); + + let mut prev_grant_end = src_span.base; + + //while let Some(grant_base) = next(src_opt.as_mut().map(|s| &mut **s), dst, remaining_src_span) { + for grant_base in to_remap { + if prev_grant_end < grant_base { + let hole_page_count = grant_base.offset_from(prev_grant_end); + let hole_span = PageSpan::new( + dst_base.next_by(prev_grant_end.offset_from(src_span.base)), + hole_page_count, + ); + dst.grants.insert(Grant::zeroed( + hole_span, + page_flags(new_flags), + &mut dst.table.utable, + &mut dst_flusher, + false, + )?); + } + + let src_grants = src_opt + .as_mut() + .map_or(&mut dst.grants, |(g, _, _)| &mut *g); + let grant = src_grants + 
.remove_containing(grant_base) + .expect("grant cannot disappear"); + let grant_span = PageSpan::new(grant.base, grant.info.page_count()); + let (before, middle, after) = grant + .extract(remaining_src_span.intersection(grant_span)) + .expect("called intersect(), must succeed"); + + if let Some(before) = before { + src_grants.insert(before); + } + if let Some(after) = after { + src_grants.insert(after); + } + + let dst_grant_base = dst_base.next_by(middle.base.offset_from(src_span.base)); + let middle_span = middle.span(); + + dst.grants.insert(match src_opt.as_mut() { + Some((_, other_mapper, other_flusher)) => middle.transfer( + dst_grant_base, + page_flags(new_flags), + other_mapper, + Some(&mut dst.table.utable), + other_flusher, + &mut dst_flusher, + )?, + None => middle.transfer( + dst_grant_base, + page_flags(new_flags), + &mut dst.table.utable, + None, + &mut dst_flusher, + &mut NopFlusher, + )?, + }); + + prev_grant_end = middle_span.base.next_by(middle_span.count); + let pages_advanced = prev_grant_end.offset_from(remaining_src_span.base); + remaining_src_span = + PageSpan::new(prev_grant_end, remaining_src_span.count - pages_advanced); + } + + if prev_grant_end < src_span.base.next_by(new_page_count) { + let last_hole_span = PageSpan::new( + dst_base.next_by(prev_grant_end.offset_from(src_span.base)), + new_page_count - prev_grant_end.offset_from(src_span.base), + ); + dst.grants.insert(Grant::zeroed( + last_hole_span, + page_flags(new_flags), + &mut dst.table.utable, + &mut dst_flusher, + false, + )?); + } + + Ok(dst_base) + } + /// Borrows a page from user memory, requiring that the frame be Allocated and read/write. This + /// is intended to be used for user-kernel shared memory. 
+ pub fn borrow_frame_enforce_rw_allocated( + self: &Arc, + page: Page, + token: &mut CleanLockToken, + ) -> Result { + let mut lock_token = token.token(); + let guard = self.acquire_write(lock_token.downgrade()); + + let (_start_page, info) = guard.grants.contains(page).ok_or(Error::new(EINVAL))?; + + if !info.can_have_flags(MapFlags::PROT_READ | MapFlags::PROT_WRITE) { + return Err(Error::new(EPERM)); + } + if !matches!(info.provider, Provider::Allocated { .. }) { + return Err(Error::new(EPERM)); + } + + let mut guard_lock = None; + let frame = if let Some((f, fl)) = guard.table.utable.translate(page.start_address()) + && fl.has_write() + { + Frame::containing(f) + } else { + let (frame, flush, new_guard) = correct_inner(self, guard, page, AccessMode::Write, 0) + .map_err(|_| Error::new(ENOMEM))?; + flush.flush(); + guard_lock = Some(new_guard); + + frame + }; + + let frame = match get_page_info(frame) + .expect("missing page info for Allocated grant") + .add_ref(RefKind::Shared) + { + Ok(_) => Ok(unsafe { RaiiFrame::new_unchecked(frame) }), + Err(AddRefError::RcOverflow) => Err(Error::new(ENOMEM)), + Err(AddRefError::SharedToCow) => unreachable!(), + Err(AddRefError::CowToShared) => unreachable!( + "if it was CoW, it was read-only, but in that case we already called correct_inner" + ), + }; + drop(guard_lock); + + frame + } +} +impl AddrSpace { + pub fn current() -> Result> { + PercpuBlock::current() + .current_addrsp + .borrow() + .clone() + .ok_or(Error::new(ESRCH)) + } + + pub fn new() -> Result { + let utable = unsafe { + PageMapper::create(TableKind::User, crate::memory::TheFrameAllocator) + .ok_or(Error::new(ENOMEM))? 
+ }; + + Ok(Self { + grants: UserGrants::new(), + table: Table { utable }, + mmap_min: MMAP_MIN_DEFAULT, + }) + } + fn munmap_inner( + this_grants: &mut UserGrants, + this_mapper: &mut PageMapper, + this_flusher: &mut Flusher, + mut requested_span: PageSpan, + unpin: bool, + ) -> Result> { + let mut notify_files = Vec::new(); + + let next = |grants: &mut UserGrants, span: PageSpan| { + grants + .conflicts(span) + .map(|(base, info)| { + if info.is_pinned() && !unpin { + Err(Error::new(EBUSY)) + } else if !info.can_extract(unpin) { + Err(Error::new(EINVAL)) + } else { + Ok(PageSpan::new(base, info.page_count)) + } + }) + .next() + }; + + while let Some(conflicting_span_res) = next(this_grants, requested_span) { + let conflicting_span = conflicting_span_res?; + + let mut grant = this_grants + .remove_containing(conflicting_span.base) + .expect("conflicting region didn't exist"); + if unpin { + grant.info.unpin(); + } + + let intersection = conflicting_span.intersection(requested_span); + + requested_span = { + // In the following diagrams [---> indicates a range of + // base..base+count where the [ is at the base and > is at + // base+count. In other words, the [ is part of the range and + // the > is not part of the range. 
+ if conflicting_span.end() < requested_span.end() { + // [------> conflicting_span + // [-------> requested_span + // [---> next requested_span + // or + // [----> conflicting_span + // [----------> requested_span + // [--> next requested_span + PageSpan::new( + conflicting_span.end(), + requested_span.end().offset_from(conflicting_span.end()), + ) + } else { + // [----------> conflicting_span + // [-----> requested_span + // next requested_span + // or + // [--------> conflicting_span + // [--------> requested_span + // next requested_span + PageSpan::empty() + } + }; + + let (before, grant, after) = grant + .extract(intersection) + .expect("conflicting region shared no common parts"); + + // Keep untouched regions + if let Some(before) = before { + this_grants.insert(before); + } + if let Some(after) = after { + this_grants.insert(after); + } + + // Remove irrelevant region + let unmap_result = grant.unmap(this_mapper, this_flusher); + + // Notify scheme that holds grant + if unmap_result.file_desc.is_some() { + notify_files.push(unmap_result); + } + } + + Ok(notify_files) + } + pub fn mmap_anywhere( + &mut self, + dst_lock: &AddrSpaceWrapper, + page_count: NonZeroUsize, + flags: MapFlags, + map: impl FnOnce(Page, PageFlags, &mut PageMapper, &mut Flusher) -> Result, + ) -> Result { + self.mmap(dst_lock, None, page_count, flags, None, map) + } + pub fn mmap( + &mut self, + dst_lock: &AddrSpaceWrapper, + requested_base_opt: Option, + page_count: NonZeroUsize, + flags: MapFlags, + notify_files_out: Option<&mut Vec>, + map: impl FnOnce(Page, PageFlags, &mut PageMapper, &mut Flusher) -> Result, + ) -> Result { + assert_eq!(dst_lock.inner.as_mut_ptr(), self as *mut Self); + + let selected_span = match requested_base_opt { + // TODO: Rename MAP_FIXED+MAP_FIXED_NOREPLACE to MAP_FIXED and + // MAP_FIXED_REPLACE/MAP_REPLACE? 
+ Some(requested_base) => { + let requested_span = PageSpan::new(requested_base, page_count.get()); + + if flags.contains(MapFlags::MAP_FIXED_NOREPLACE) { + if self.grants.conflicts(requested_span).next().is_some() { + return Err(Error::new(EEXIST)); + } + requested_span + } else if flags.contains(MapFlags::MAP_FIXED) { + let unpin = false; + let mut notify_files = Self::munmap_inner( + &mut self.grants, + &mut self.table.utable, + &mut Flusher::with_cpu_set(&dst_lock.used_by, &dst_lock.tlb_ack), + requested_span, + unpin, + )?; + if let Some(notify_files_out) = notify_files_out { + notify_files_out.append(&mut notify_files); + } + + requested_span + } else { + self.grants + .find_free_near(self.mmap_min, page_count.get(), Some(requested_base)) + .ok_or(Error::new(ENOMEM))? + } + } + None => self + .grants + .find_free(self.mmap_min, page_count.get()) + .ok_or(Error::new(ENOMEM))?, + }; + + // TODO: Threads share address spaces, so not only the inactive flusher should be sending + // out IPIs. IPIs will only be sent when downgrading mappings (i.e. when a stale TLB entry + // will not be corrected by a page fault), and will furthermore require proper + // synchronization. + + let grant = map( + selected_span.base, + page_flags(flags), + &mut self.table.utable, + &mut Flusher::with_cpu_set(&dst_lock.used_by, &dst_lock.tlb_ack), + )?; + self.grants.insert(grant); + + Ok(selected_span.base) + } + + pub fn into_drop(self, token: &mut CleanLockToken) { + ManuallyDrop::new(self).inner_drop(token); + } + + fn inner_drop(&mut self, token: &mut CleanLockToken) { + for mut grant in core::mem::take(&mut self.grants).into_iter() { + // Unpinning the grant is allowed, because pinning only occurs in UserScheme calls to + // prevent unmapping the mapped range twice (which would corrupt only the scheme + // provider), but it won't be able to double free any range after this address space + // has been dropped! 
+ grant.info.unpin(); + + // TODO: Optimize away clearing the actual page tables? Since this address space is no + // longer arc-rwlock wrapped, it cannot be referenced `External`ly by borrowing grants, + // so it should suffice to iterate over PageInfos and decrement and maybe deallocate + // the underlying pages (and send some funmaps). + let res = { grant.unmap(&mut self.table.utable, &mut NopFlusher) }; + + let _ = res.unmap(token); + } + } +} + +#[derive(Debug)] +pub struct UserGrants { + // Using a BTreeMap for its range method. + inner: BTreeMap, + // Using a BTreeMap for its range method. + holes: BTreeMap, + // TODO: Would an additional map ordered by (size,start) to allow for O(log n) allocations be + // beneficial? +} + +#[derive(Clone, Copy)] +pub struct PageSpan { + pub base: Page, + pub count: usize, +} +impl PageSpan { + pub fn new(base: Page, count: usize) -> Self { + Self { base, count } + } + pub fn empty() -> Self { + Self { + base: Page::containing_address(VirtualAddress::new(0)), + count: 0, + } + } + pub fn validate_nonempty(address: VirtualAddress, size: usize) -> Option { + Self::validate(address, size).filter(|this| !this.is_empty()) + } + pub fn validate(address: VirtualAddress, size: usize) -> Option { + if !address.data().is_multiple_of(PAGE_SIZE) || !size.is_multiple_of(PAGE_SIZE) { + return None; + } + if address.data().saturating_add(size) > crate::USER_END_OFFSET { + return None; + } + + Some(Self::new( + Page::containing_address(address), + size / PAGE_SIZE, + )) + } + pub fn is_empty(&self) -> bool { + self.count == 0 + } + pub fn intersection(&self, with: PageSpan) -> PageSpan { + Self::between( + cmp::max(self.base, with.base), + cmp::min(self.end(), with.end()), + ) + } + pub fn intersects(&self, with: PageSpan) -> bool { + !self.intersection(with).is_empty() + } + pub fn slice(&self, inner_span: PageSpan) -> (Option, PageSpan, Option) { + (self.before(inner_span), inner_span, self.after(inner_span)) + } + pub fn pages(self) -> 
impl Iterator { + (0..self.count).map(move |i| self.base.next_by(i)) + } + + pub fn end(&self) -> Page { + self.base.next_by(self.count) + } + + /// Returns the span from the start of self until the start of the specified span. + pub fn before(self, span: Self) -> Option { + assert!(self.base <= span.base); + Some(Self::between(self.base, span.base)).filter(|reg| !reg.is_empty()) + } + + /// Returns the span from the end of the given span until the end of self. + pub fn after(self, span: Self) -> Option { + assert!(span.end() <= self.end()); + Some(Self::between(span.end(), self.end())).filter(|reg| !reg.is_empty()) + } + /// Returns the span between two pages, `[start, end)`, truncating to zero if end < start. + pub fn between(start: Page, end: Page) -> Self { + Self::new( + start, + end.start_address() + .data() + .saturating_sub(start.start_address().data()) + / PAGE_SIZE, + ) + } +} + +impl Default for UserGrants { + fn default() -> Self { + Self::new() + } +} +impl Debug for PageSpan { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "[{:p}:{:p}, {} pages]", + self.base.start_address().data() as *const u8, + self.base + .start_address() + .add((self.count * PAGE_SIZE).saturating_sub(1)) // saturate: an empty span (count == 0) must not underflow + .data() as *const u8, + self.count + ) + } +} + +impl UserGrants { + pub fn new() -> Self { + Self { + inner: BTreeMap::new(), + holes: core::iter::once((VirtualAddress::new(0), crate::USER_END_OFFSET)) + .collect::>(), + } + } + + /// Returns the grant, if any, which occupies the specified page + pub fn contains(&self, page: Page) -> Option<(Page, &GrantInfo)> { + self.inner + .range(..=page) + .next_back() + .filter(|(base, info)| (**base..base.next_by(info.page_count)).contains(&page)) + .map(|(base, info)| (*base, info)) + } + + /// Returns an iterator over all grants that occupy some part of the + /// requested region + pub fn conflicts(&self, span: PageSpan) -> impl Iterator + '_ { + let start = self.contains(span.base); + + // If there is a 
grant that contains the base page, start searching at the base of that + // grant, rather than the requested base here. + let start_span = start + .map(|(base, info)| PageSpan::new(base, info.page_count)) + .unwrap_or(span); + + self.inner + .range(start_span.base..) + .take_while(move |(base, info)| PageSpan::new(**base, info.page_count).intersects(span)) + .map(|(base, info)| (*base, info)) + } + // TODO: DEDUPLICATE CODE! + pub fn conflicts_mut( + &mut self, + span: PageSpan, + ) -> impl Iterator + '_ { + let start = self.contains(span.base); + + // If there is a grant that contains the base page, start searching at the base of that + // grant, rather than the requested base here. + let start_span = start + .map(|(base, info)| PageSpan::new(base, info.page_count)) + .unwrap_or(span); + + self.inner + .range_mut(start_span.base..) + .take_while(move |(base, info)| PageSpan::new(**base, info.page_count).intersects(span)) + .map(|(base, info)| (*base, info)) + } + /// Return a free region with the specified size + // TODO: Alignment (x86_64: 4 KiB, 2 MiB, or 1 GiB). + // TODO: Support finding grant close to a requested address? + pub fn find_free_near( + &self, + min: usize, + page_count: usize, + _near: Option, + ) -> Option { + // Get first available hole, but do reserve the page starting from zero as most compiled + // languages cannot handle null pointers safely even if they point to valid memory. If an + // application absolutely needs to map the 0th page, they will have to do so explicitly via + // MAP_FIXED/MAP_FIXED_NOREPLACE. + // TODO: Allow explicitly allocating guard pages? Perhaps using mprotect or mmap with + // PROT_NONE? 
+ + let (hole_start, _hole_size) = self + .holes + .iter() + .skip_while(|(hole_offset, hole_size)| hole_offset.data() + **hole_size <= min) + .find(|(hole_offset, hole_size)| { + let avail_size = + if hole_offset.data() <= min && min <= hole_offset.data() + **hole_size { + **hole_size - (min - hole_offset.data()) + } else { + **hole_size + }; + page_count * PAGE_SIZE <= avail_size + })?; + // Create new region + Some(PageSpan::new( + Page::containing_address(VirtualAddress::new(cmp::max(hole_start.data(), min))), + page_count, + )) + } + pub fn find_free(&self, min: usize, page_count: usize) -> Option { + self.find_free_near(min, page_count, None) + } + fn reserve(&mut self, base: Page, page_count: usize) { + let start_address = base.start_address(); + let size = page_count * PAGE_SIZE; + let end_address = base.start_address().add(size); + + let previous_hole = self.holes.range_mut(..start_address).next_back(); + + if let Some((hole_offset, hole_size)) = previous_hole { + let prev_hole_end = hole_offset.data() + *hole_size; + + // Note that prev_hole_end cannot exactly equal start_address, since that would imply + // there is another grant at that position already, as it would otherwise have been + // larger. + + if prev_hole_end > start_address.data() { + // hole_offset must be below (but never equal to) the start address due to the + // `..start_address()` limit; hence, all we have to do is to shrink the + // previous offset. + *hole_size = start_address.data() - hole_offset.data(); + } + if prev_hole_end > end_address.data() { + // The grant is splitting this hole in two, so insert the new one at the end. 
+ self.holes + .insert(end_address, prev_hole_end - end_address.data()); + } + } + + // Next hole + if let Some(hole_size) = self.holes.remove(&start_address) { + let remainder = hole_size - size; + if remainder > 0 { + self.holes.insert(end_address, remainder); + } + } + } + fn unreserve(holes: &mut BTreeMap, base: Page, page_count: usize) { + // TODO + let start_address = base.start_address(); + let size = page_count * PAGE_SIZE; + let end_address = base.start_address().add(size); + + // The size of any possible hole directly after the to-be-freed region. + let exactly_after_size = holes.remove(&end_address); + + // There was a range that began exactly prior to the to-be-freed region, so simply + // increment the size such that it occupies the grant too. If in addition there was a grant + // directly after the grant, include it too in the size. + if let Some((hole_offset, hole_size)) = holes + .range_mut(..start_address) + .next_back() + .filter(|(offset, size)| offset.data() + **size == start_address.data()) + { + *hole_size = end_address.data() - hole_offset.data() + exactly_after_size.unwrap_or(0); + } else { + // There was no free region directly before the to-be-freed region, however will + // now unconditionally insert a new free region where the grant was, and add that extra + // size if there was something after it. + holes.insert(start_address, size + exactly_after_size.unwrap_or(0)); + } + } + pub fn insert(&mut self, mut grant: Grant) { + assert!(self + .conflicts(PageSpan::new(grant.base, grant.info.page_count)) + .next() + .is_none()); + self.reserve(grant.base, grant.info.page_count); + + let before_region = self + .inner + .range(..grant.base) + .next_back() + .filter(|(base, info)| { + base.next_by(info.page_count) == grant.base + && info.can_be_merged_if_adjacent(&grant.info) + }) + .map(|(base, info)| (*base, info.page_count)); + + let after_region = self + .inner + .range(grant.span().end()..) 
+ .next() + .filter(|(base, info)| { + **base == grant.base.next_by(grant.info.page_count) + && info.can_be_merged_if_adjacent(&grant.info) + }) + .map(|(base, info)| (*base, info.page_count)); + + if let Some((before_base, before_page_count)) = before_region { + grant.base = before_base; + grant.info.page_count += before_page_count; + + core::mem::forget(self.inner.remove(&before_base)); + } + if let Some((after_base, after_page_count)) = after_region { + grant.info.page_count += after_page_count; + + core::mem::forget(self.inner.remove(&after_base)); + } + + self.inner.insert(grant.base, grant.info); + } + + pub fn remove_containing(&mut self, page: Page) -> Option { + // Points to the gap *after* the greatest grant smaller than or equal to `page`. + let mut cursor = self.inner.upper_bound_mut(Bound::Included(&page)); + let (&base, info) = cursor.peek_prev()?; + + if (base..base.next_by(info.page_count())).contains(&page) { + let (base, info) = cursor.remove_prev().unwrap(); + Self::unreserve(&mut self.holes, base, info.page_count()); + Some(Grant { base, info }) + } else { + None + } + } + + pub fn iter(&self) -> impl Iterator + '_ { + self.inner.iter().map(|(base, info)| (*base, info)) + } + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + pub fn into_iter(self) -> impl Iterator { + self.inner + .into_iter() + .map(|(base, info)| Grant { base, info }) + } +} + +#[derive(Debug)] +pub struct GrantInfo { + page_count: usize, + flags: PageFlags, + // TODO: Rename to unmapped? + mapped: bool, + pub(crate) provider: Provider, +} + +/// Enumeration of various types of grants. +#[derive(Debug)] +pub enum Provider { + /// The grant is owned, but possibly CoW-shared. + /// + /// The pages this grant spans, need not necessarily be initialized right away, and can be + /// populated either from zeroed frames, the CoW zeroed frame, or from a scheme fmap call, if + /// mapped with MAP_LAZY. All frames must have an available PageInfo. 
+ Allocated { + cow_file_ref: Option, + phys_contiguous: bool, + }, + + /// The grant is owned, but possibly shared. + /// + /// The pages may only be lazily initialized, if the address space has not yet been cloned (when forking). + /// + /// This type of grants is obtained from MAP_SHARED anonymous or `memory:` mappings, i.e. + /// allocated memory that remains shared after address space clones. + AllocatedShared { is_pinned_userscheme_borrow: bool }, + + /// The grant is not owned, but borrowed from physical memory frames that do not belong to the + /// frame allocator. The kernel will forbid borrowing any physical memory range, that the + /// memory map has indicated is regular allocatable RAM. + PhysBorrowed { base: Frame }, + + /// The memory is borrowed directly from another address space. + External { + address_space: Arc, + src_base: Page, + is_pinned_userscheme_borrow: bool, + }, + + /// The memory is MAP_SHARED borrowed from a scheme. + /// + /// Since the address space is not tracked here, all nonpresent pages must be present before + /// the fmap operation completes, unless MAP_LAZY is specified. They are tracked using + /// PageInfo, or treated as PhysBorrowed if any frame lacks a PageInfo. + FmapBorrowed { + file_ref: GrantFileRef, + pin_refcount: usize, + }, +} + +#[derive(Debug)] +pub struct Grant { + pub(crate) base: Page, + pub(crate) info: GrantInfo, +} + +#[derive(Clone, Debug)] +pub struct GrantFileRef { + pub description: Arc, + pub base_offset: usize, +} + +impl Grant { + // TODO: PageCount newtype, to avoid confusion between bytes and pages? + + // `base` must be mapped by the caller. 
+ pub fn allocated_one_page_nomap(base: Page, flags: PageFlags) -> Grant { + Grant { + base, + info: GrantInfo { + page_count: 1, + flags, + mapped: true, + provider: Provider::Allocated { + cow_file_ref: None, + phys_contiguous: false, + }, + }, + } + } + + // TODO: is_pinned + pub fn allocated_shared_one_page( + frame: Frame, + page: Page, + flags: PageFlags, + mapper: &mut PageMapper, + flusher: &mut Flusher, + is_pinned: bool, + ) -> Result { + let info = get_page_info(frame).expect("needs page info"); + + // TODO: + // + // This may not necessarily hold, as even pinned memory can remain shared (e.g. proc: + // borrow), but it would probably be possible to forbid borrowing memory there as well. + // Maybe make it exclusive first using cow(), unless that is too expensive. + // + // assert_eq!(info.refcount(), RefCount::One); + + // Semantically, the page will be shared between the "context struct" and whatever + // else. + info.add_ref(RefKind::Shared) + .expect("must be possible if previously Zero"); + + unsafe { + mapper + .map_phys(page.start_address(), frame.base(), flags) + .ok_or(Error::new(ENOMEM))? 
+ .ignore(); + + flusher.queue(frame, None, TlbShootdownActions::NEW_MAPPING); + } + + Ok(Grant { + base: page, + info: GrantInfo { + page_count: 1, + flags, + mapped: true, + provider: Provider::AllocatedShared { + is_pinned_userscheme_borrow: is_pinned, + }, + }, + }) + } + + pub fn physmap( + phys: Frame, + span: PageSpan, + flags: PageFlags, + mapper: &mut PageMapper, + flusher: &mut impl GenericFlusher, + ) -> Result { + const MAX_EAGER_PAGES: usize = 4096; + + for i in 0..span.count { + if let Some(info) = get_page_info(phys.next_by(i)) { + warn!("Driver tried to physmap the allocator-frame {phys:?} (info {info:?})!"); + return Err(Error::new(EPERM)); + } + } + + for (i, page) in span.pages().enumerate().take(MAX_EAGER_PAGES) { + let frame = phys.next_by(i); + unsafe { + let Some(result) = + mapper.map_phys(page.start_address(), frame.base(), flags.write(false)) + else { + break; + }; + result.ignore(); + + flusher.queue(frame, None, TlbShootdownActions::NEW_MAPPING); + } + } + + Ok(Grant { + base: span.base, + info: GrantInfo { + page_count: span.count, + flags, + mapped: true, + provider: Provider::PhysBorrowed { base: phys }, + }, + }) + } + pub fn zeroed_phys_contiguous( + span: PageSpan, + flags: PageFlags, + mapper: &mut PageMapper, + flusher: &mut Flusher, + ) -> Result { + if !span.count.is_power_of_two() { + warn!("Attempted non-power-of-two zeroed_phys_contiguous allocation, rounding up to next power of two."); + } + + let alloc_order = span.count.next_power_of_two().trailing_zeros(); + let base = crate::memory::allocate_p2frame(alloc_order).ok_or(Enomem)?; + + for (i, page) in span.pages().enumerate() { + let frame = base.next_by(i); + + get_page_info(frame) + .expect("PageInfo must exist for allocated frame") + .refcount + .store(RefCount::One.to_raw(), Ordering::Relaxed); + + unsafe { + let result = mapper + .map_phys(page.start_address(), frame.base(), flags) + .expect("TODO: page table OOM"); + result.ignore(); + + flusher.queue(frame, None, 
TlbShootdownActions::NEW_MAPPING); + } + } + + Ok(Grant { + base: span.base, + info: GrantInfo { + page_count: span.count, + flags, + mapped: true, + provider: Provider::Allocated { + cow_file_ref: None, + phys_contiguous: true, + }, + }, + }) + } + pub fn zeroed( + span: PageSpan, + flags: PageFlags, + mapper: &mut PageMapper, + flusher: &mut Flusher, + shared: bool, + ) -> Result { + const MAX_EAGER_PAGES: usize = 16; + + let (the_frame, the_frame_info) = the_zeroed_frame(); + + // TODO: Use flush_all after a certain number of pages, otherwise no + + for page in span.pages().take(MAX_EAGER_PAGES) { + // Good thing with lazy page fault handlers, is that if we fail due to ENOMEM here, we + // can continue and let the process face the OOM killer later. + unsafe { + the_frame_info + .add_ref(RefKind::Cow) + .expect("the static zeroed frame cannot be shared!"); + + let Some(result) = + mapper.map_phys(page.start_address(), the_frame.base(), flags.write(false)) + else { + break; + }; + result.ignore(); + flusher.queue(the_frame, None, TlbShootdownActions::NEW_MAPPING); + } + } + + Ok(Grant { + base: span.base, + info: GrantInfo { + page_count: span.count, + flags, + mapped: true, + provider: if shared { + Provider::AllocatedShared { + is_pinned_userscheme_borrow: false, + } + } else { + Provider::Allocated { + cow_file_ref: None, + phys_contiguous: false, + } + }, + }, + }) + } + + // XXX: borrow_grant is needed because of the borrow checker (iterator invalidation), maybe + // borrow_grant/borrow can be abstracted somehow? 
+    /// Creates a `Provider::External` grant at `dst_base` that borrows from `src_base` in the
+    /// given source address space, copying the page count and flags from `src_info`.
+    ///
+    /// No page table is modified: `_mapper`, `_dst_flusher` and `_eager` are unused, and the
+    /// function always returns `Ok`. The new grant is never marked as a pinned userscheme
+    /// borrow.
+    pub fn borrow_grant(
+        src_address_space_lock: Arc,
+        src_base: Page,
+        dst_base: Page,
+        src_info: &GrantInfo,
+        _mapper: &mut PageMapper,
+        _dst_flusher: &mut impl GenericFlusher,
+        _eager: bool,
+    ) -> Result {
+        Ok(Grant {
+            base: dst_base,
+            info: GrantInfo {
+                page_count: src_info.page_count,
+                flags: src_info.flags,
+                mapped: true,
+                provider: Provider::External {
+                    src_base,
+                    address_space: src_address_space_lock,
+                    is_pinned_userscheme_borrow: false,
+                },
+            },
+        })
+    }
+
+    /// Creates a `Provider::FmapBorrowed` grant over `span` for `file_ref`, with
+    /// `pin_refcount` starting at 0.
+    ///
+    /// When `src` is `Some`, every page of `span` is mapped eagerly to the frame backing the
+    /// corresponding source page (`src.src_base` plus the page's offset into `span`). When
+    /// `src` is `None`, no page is mapped and only the grant bookkeeping is returned. `_lock`
+    /// is unused.
+    ///
+    /// Per page:
+    /// - `MmapMode::Shared`: the source page is translated; if it is not present,
+    ///   `correct_inner` is called with `AccessMode::Read` and the guard it returns replaces
+    ///   the current one.
+    /// - `MmapMode::Cow`: the source mapping has its write bit cleared via `remap_with`, with
+    ///   the same `correct_inner` fallback.
+    /// - If the frame has a `PageInfo`, a `RefKind::Shared` reference is added; on
+    ///   `CowToShared` the frame goes through `cow()` and the source page is remapped to the
+    ///   resulting frame. Frames without a `PageInfo` are mapped as they are.
+    /// - The destination is mapped writable only if `new_flags` has write and the mode is not
+    ///   `Cow`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `EIO` if `correct_inner` fails, and `ENOMEM` if `cow()` fails or the refcount
+    /// would overflow.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the destination `map_phys` fails, or if the source page is gone when it is
+    /// remapped after `cow()`.
+    pub fn borrow_fmap(
+        span: PageSpan,
+        new_flags: PageFlags,
+        file_ref: GrantFileRef,
+        src: Option>,
+        _lock: &AddrSpaceWrapper,
+        mapper: &mut PageMapper,
+        flusher: &mut Flusher,
+    ) -> Result {
+        if let Some(src) = src {
+            let mut guard = src.addr_space_guard;
+            let mut src_addrspace = &mut *guard;
+            // The source flusher is kept in detached form between iterations and rebuilt only
+            // in the CowToShared arm below.
+            let mut src_flusher_state =
+                Flusher::with_cpu_set(&src.addr_space_lock.used_by, &src.addr_space_lock.tlb_ack)
+                    .detach();
+            for dst_page in span.pages() {
+                let src_page = src.src_base.next_by(dst_page.offset_from(span.base));
+
+                let (frame, is_cow) = match src.mode {
+                    MmapMode::Shared => {
+                        // TODO: Error code for "scheme responded with unmapped page"?
+                        let frame = match src_addrspace
+                            .table
+                            .utable
+                            .translate(src_page.start_address())
+                        {
+                            Some((phys, _)) => Frame::containing(phys),
+                            // TODO: ensure the correct context is hardblocked, if necessary
+                            None => {
+                                let (frame, _, new_guard) = correct_inner(
+                                    src.addr_space_lock,
+                                    guard,
+                                    src_page,
+                                    AccessMode::Read,
+                                    0,
+                                )
+                                .map_err(|_| Error::new(EIO))?;
+                                guard = new_guard;
+                                frame
+                            }
+                        };
+
+                        (frame, false)
+                    }
+                    MmapMode::Cow => unsafe {
+                        // NOTE(review): the flush value returned by `remap_with` is discarded
+                        // here and nothing is queued on the source flusher in this arm --
+                        // confirm the write revocation reaches other CPUs.
+                        let frame = match guard
+                            .table
+                            .utable
+                            .remap_with(src_page.start_address(), |flags| flags.write(false))
+                        {
+                            Some((_, phys, _)) => Frame::containing(phys),
+                            // TODO: ensure the correct context is hardblocked, if necessary
+                            None => {
+                                let (frame, _, new_guard) = correct_inner(
+                                    src.addr_space_lock,
+                                    guard,
+                                    src_page,
+                                    AccessMode::Read,
+                                    0,
+                                )
+                                .map_err(|_| Error::new(EIO))?;
+                                guard = new_guard;
+                                frame
+                            }
+                        };
+
+                        (frame, true)
+                    },
+                };
+                // `guard` may have been replaced above, so re-derive the reference from it.
+                src_addrspace = &mut *guard;
+
+                let frame = if let Some(page_info) = get_page_info(frame) {
+                    match page_info.add_ref(RefKind::Shared) {
+                        Ok(()) => frame,
+                        Err(AddRefError::CowToShared) => unsafe {
+                            let CowResult {
+                                new_frame: new_cow_frame,
+                                old_frame,
+                            } = cow(frame, page_info, RefKind::Shared)
+                                .map_err(|_| Error::new(ENOMEM))?;
+
+                            // Point the source page at the frame produced by `cow()`, keeping
+                            // its existing flags.
+                            let (old_flags, _, _flush) = src_addrspace
+                                .table
+                                .utable
+                                .remap_with_full(src_page.start_address(), |_, flags| {
+                                    Some((new_cow_frame.base(), flags))
+                                })
+                                .expect("page did exist");
+
+                            // TODO: flush.ignore() is correct, but seems to be amplifying a
+                            // userspace race condition
+                            //
+                            //flush.ignore();
+
+                            let mut src_flusher = Flusher {
+                                active_cpus: &src.addr_space_lock.used_by,
+                                state: src_flusher_state,
+                            };
+                            // NOTE(review): the remap above keeps the source flags unchanged,
+                            // yet the action is computed against the destination's
+                            // `new_flags` -- confirm this is the intended pair.
+                            src_flusher.queue(
+                                frame,
+                                None,
+                                TlbShootdownActions::change_of_flags(old_flags, new_flags),
+                            );
+
+                            if let Some(old_frame) = old_frame {
+                                src_flusher.queue(old_frame, None, TlbShootdownActions::FREE);
+                            }
+                            src_flusher_state = src_flusher.detach();
+
+                            // TODO: there used to be an additional remove_ref here, was that
+                            // correct?
+
+                            new_cow_frame
+                        },
+                        Err(AddRefError::SharedToCow) => unreachable!(),
+                        Err(AddRefError::RcOverflow) => return Err(Error::new(ENOMEM)),
+                    }
+                } else {
+                    frame
+                };
+
+                unsafe {
+                    let flush = mapper
+                        .map_phys(
+                            dst_page.start_address(),
+                            frame.base(),
+                            new_flags.write(new_flags.has_write() && !is_cow),
+                        )
+                        .unwrap();
+                    flush.ignore();
+
+                    flusher.queue(frame, None, TlbShootdownActions::NEW_MAPPING);
+                }
+            }
+        }
+
+        Ok(Self {
+            base: span.base,
+            info: GrantInfo {
+                page_count: span.count,
+                mapped: true,
+                flags: new_flags,
+                provider: Provider::FmapBorrowed {
+                    file_ref,
+                    pin_refcount: 0,
+                },
+            },
+        })
+    }
+
+    /// Borrow all pages in the range `[src_base, src_base+page_count)` from `src_address_space`,
+    /// mapping them into `[dst_base, dst_base+page_count)`. The destination pages will lazily read
+    /// the page tables of the source pages, but once present in the destination address space,
+    /// pages that are unmapped or moved will not be made visible to the destination address space.
+ pub fn borrow( + src_address_space_lock: Arc, + src_address_space: &mut AddrSpace, + src_base: Page, + dst_base: Page, + page_count: usize, + map_flags: MapFlags, + dst_mapper: &mut PageMapper, + dst_flusher: &mut Flusher, + eager: bool, + _allow_phys: bool, + is_pinned_userscheme_borrow: bool, + ) -> Result { + let flags = page_flags(map_flags); + + const MAX_EAGER_PAGES: usize = 4096; + + let src_span = PageSpan::new(src_base, page_count); + let mut prev_span = None; + + for (src_grant_base, src_grant) in src_address_space.grants.conflicts_mut(src_span) { + let grant_span = PageSpan::new(src_grant_base, src_grant.page_count); + let prev_span = prev_span.replace(grant_span); + + if prev_span.is_none() && src_grant_base > src_base { + warn!( + "Grant too far away, prev_span {:?} src_base {:?} grant base {:?} grant {:#?}", + prev_span, src_base, src_grant_base, src_grant + ); + return Err(Error::new(EINVAL)); + } else if let Some(prev) = prev_span + && prev.end() != src_grant_base + { + warn!( + "Hole between grants, prev_span {:?} src_base {:?} grant base {:?} grant {:#?}", + prev_span, src_base, src_grant_base, src_grant + ); + return Err(Error::new(EINVAL)); + } + + if !src_grant.can_have_flags(map_flags) { + return Err(Error::new(EPERM)); + } + + if let Provider::FmapBorrowed { + ref mut pin_refcount, + .. 
+ } = src_grant.provider + { + *pin_refcount += 1; + } + } + + let Some(last_span) = prev_span else { + warn!("Called Grant::borrow, but no grants were there!"); + return Err(Error::new(EINVAL)); + }; + + if last_span.end() < src_span.end() { + warn!("Requested end page too far away from last grant"); + return Err(Error::new(EINVAL)); + } + if eager { + for (i, page) in PageSpan::new(src_base, page_count) + .pages() + .enumerate() + .take(MAX_EAGER_PAGES) + { + let Some((phys, _)) = src_address_space + .table + .utable + .translate(page.start_address()) + else { + continue; + }; + + let writable = match get_page_info(Frame::containing(phys)) { + None => true, + Some(i) => { + if i.add_ref(RefKind::Shared).is_err() { + continue; + }; + + i.allows_writable() + } + }; + + unsafe { + let flush = dst_mapper + .map_phys( + dst_base.next_by(i).start_address(), + phys, + flags.write(flags.has_write() && writable), + ) + .ok_or(Error::new(ENOMEM))?; + flush.ignore(); + + dst_flusher.queue( + Frame::containing(phys), + None, + TlbShootdownActions::NEW_MAPPING, + ); + } + } + } + + Ok(Grant { + base: dst_base, + info: GrantInfo { + page_count, + flags, + mapped: true, + provider: Provider::External { + address_space: src_address_space_lock, + src_base, + is_pinned_userscheme_borrow, + }, + }, + }) + } + pub fn copy_mappings( + src_base: Page, + dst_base: Page, + page_count: usize, + flags: PageFlags, + src_mapper: &mut PageMapper, + dst_mapper: &mut PageMapper, + src_flusher: &mut Flusher, + dst_flusher: &mut impl GenericFlusher, + mode: CopyMappingsMode, + ) -> Result { + let (allows_writable, rk) = match mode { + CopyMappingsMode::Owned { .. 
} => (false, RefKind::Cow), + CopyMappingsMode::Borrowed => (true, RefKind::Shared), + }; + + // TODO: Page table iterator + for page_idx in 0..page_count { + let src_page = src_base.next_by(page_idx); + let dst_page = dst_base.next_by(page_idx).start_address(); + + let src_frame = match rk { + RefKind::Cow => { + let Some((_, phys, flush)) = (unsafe { + src_mapper.remap_with(src_page.start_address(), |flags| flags.write(false)) + }) else { + // Page is not mapped, let the page fault handler take care of that (initializing + // it to zero). + // + // TODO: If eager, allocate zeroed page if writable, or use *the* zeroed page (also + // for read-only)? + continue; + }; + unsafe { + flush.ignore(); + } + let frame = Frame::containing(phys); + src_flusher.queue(frame, None, TlbShootdownActions::REVOKE_WRITE); + frame + } + RefKind::Shared => { + if let Some((phys, _)) = src_mapper.translate(src_page.start_address()) { + Frame::containing(phys) + } else { + // TODO: Omit the unnecessary subsequent add_ref call. 
+ let new_frame = init_frame(RefCount::One).expect("TODO: handle OOM"); + let src_flush = unsafe { + src_mapper + .map_phys(src_page.start_address(), new_frame.base(), flags) + .expect("TODO: handle OOM") + }; + unsafe { + src_flush.ignore(); + } + src_flusher.queue(new_frame, None, TlbShootdownActions::NEW_MAPPING); + + new_frame + } + } + }; + + let src_frame = { + let src_page_info = get_page_info(src_frame) + .expect("allocated page was not present in the global page array"); + + match src_page_info.add_ref(rk) { + Ok(()) => src_frame, + Err(AddRefError::CowToShared) => { + let CowResult { + new_frame, + old_frame, + } = cow(src_frame, src_page_info, rk).map_err(|_| Enomem)?; + if let Some(old_frame) = old_frame { + src_flusher.queue(old_frame, None, TlbShootdownActions::FREE); + } + + // TODO: Flusher + unsafe { + if let Some((_flags, phys, flush)) = src_mapper + .remap_with_full(src_page.start_address(), |_, f| { + Some((new_frame.base(), f)) + }) + { + // TODO: flush.ignore() is correct, but seems to be amplifying a + // userspace race condition + // + //flush.ignore(); + flush.flush(); + + // FIXME: Is MOVE correct? + src_flusher.queue( + Frame::containing(phys), + None, + TlbShootdownActions::MOVE, + ); + } + } + + new_frame + } + // Cannot be shared and CoW simultaneously. + Err(AddRefError::SharedToCow) => { + // The call to cow() later implicitly removes one ref, so add it here + // first, even if Shared. + if src_page_info.add_ref(RefKind::Shared) == Err(AddRefError::RcOverflow) { + return Err(Enomem); + } + + // TODO: Copy in place, or use a zeroed page? 
+ let CowResult { + new_frame, + old_frame, + } = cow(src_frame, src_page_info, rk).map_err(|_| Enomem)?; + if let Some(old_frame) = old_frame { + src_flusher.queue(old_frame, None, TlbShootdownActions::FREE); + } + new_frame + } + Err(AddRefError::RcOverflow) => return Err(Enomem), + } + }; + + let Some(map_result) = (unsafe { + dst_mapper.map_phys( + dst_page, + src_frame.base(), + flags.write(flags.has_write() && allows_writable), + ) + }) else { + break; + }; + unsafe { + map_result.ignore(); + } + + dst_flusher.queue(src_frame, None, TlbShootdownActions::NEW_MAPPING); + } + + Ok(Grant { + base: dst_base, + info: GrantInfo { + page_count, + flags, + mapped: true, + provider: match mode { + CopyMappingsMode::Owned { cow_file_ref } => Provider::Allocated { + cow_file_ref, + phys_contiguous: false, + }, + CopyMappingsMode::Borrowed => Provider::AllocatedShared { + is_pinned_userscheme_borrow: false, + }, + }, + }, + }) + } + /// Move a grant between two address spaces. + pub fn transfer( + mut self, + dst_base: Page, + flags: PageFlags, + src_mapper: &mut PageMapper, + mut dst_mapper: Option<&mut PageMapper>, + src_flusher: &mut Flusher, + dst_flusher: &mut impl GenericFlusher, + ) -> Result { + assert!(!self.info.is_pinned()); + + for src_page in self.span().pages() { + let dst_page = dst_base.next_by(src_page.offset_from(self.base)); + + // TODO: Validate flags? + let Some((phys, _flags, flush)) = + (unsafe { src_mapper.unmap_phys(src_page.start_address()) }) + else { + continue; + }; + unsafe { + flush.ignore(); + } + src_flusher.queue(Frame::containing(phys), None, TlbShootdownActions::MOVE); + + let dst_mapper = dst_mapper.as_deref_mut().unwrap_or(&mut *src_mapper); + + // TODO: Preallocate to handle OOM? 
+ let flush = unsafe { + dst_mapper + .map_phys(dst_page.start_address(), phys, flags) + .expect("TODO: OOM") + }; + unsafe { + flush.ignore(); + } + dst_flusher.queue( + Frame::containing(phys), + None, + TlbShootdownActions::NEW_MAPPING, + ); + } + + self.base = dst_base; + Ok(self) + } + + // Caller must check this doesn't violate access rights for e.g. shared memory. + pub fn remap( + &mut self, + mapper: &mut PageMapper, + flusher: &mut Flusher, + new_flags: PageFlags, + ) { + assert!(self.info.mapped); + + for page in self.span().pages() { + unsafe { + // Lazy mappings don't require remapping, as info.flags will be updated. + let Some((old_flags, phys, flush)) = + mapper.remap_with_full(page.start_address(), |same_phys, old_flags| { + if !old_flags.has_write() && new_flags.has_write() { + // Page flags will be updated in [`correct_inner`]. + None + } else { + Some((same_phys, new_flags)) + } + }) + else { + continue; + }; + flush.ignore(); + //info!("Remapped page {:?} (frame {:?})", page, Frame::containing(mapper.translate(page.start_address()).unwrap().0)); + flusher.queue( + Frame::containing(phys), + None, + TlbShootdownActions::change_of_flags(old_flags, new_flags), + ); + } + } + + self.info.flags = new_flags; + } + + #[must_use = "will not unmap itself"] + pub fn unmap( + mut self, + mapper: &mut PageMapper, + flusher: &mut impl GenericFlusher, + ) -> UnmapResult { + assert!(self.info.mapped); + assert!(!self.info.is_pinned()); + + if let Provider::External { + ref address_space, + src_base, + .. + } = self.info.provider + { + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + let mut token = token.token(); + let mut guard = address_space.acquire_write(token.downgrade()); + + for (_, grant) in guard + .grants + .conflicts_mut(PageSpan::new(src_base, self.info.page_count)) + { + match grant.provider { + Provider::FmapBorrowed { + ref mut pin_refcount, + .. 
+ } => { + *pin_refcount = pin_refcount + .checked_sub(1) + .expect("fmap pinning code is wrong") + } + _ => continue, + } + } + } + + let is_phys_contiguous = matches!( + self.info.provider, + Provider::Allocated { + phys_contiguous: true, + .. + } + ); + + // TODO: Add old debug assertions back, into Flusher. + let is_fmap_shared = match self.info.provider { + Provider::Allocated { .. } => Some(false), + Provider::AllocatedShared { .. } => None, + Provider::External { .. } => None, + Provider::PhysBorrowed { .. } => None, + Provider::FmapBorrowed { .. } => Some(true), + }; + + if is_phys_contiguous { + let (phys_base, _) = mapper.translate(self.base.start_address()).unwrap(); + let base_frame = Frame::containing(phys_base); + + for i in 0..self.info.page_count { + unsafe { + let (phys, _, flush) = mapper + .unmap_phys(self.base.next_by(i).start_address()) + .expect("all physborrowed grants must be fully Present in the page tables"); + flush.ignore(); + + assert_eq!(phys, base_frame.next_by(i).base()); + } + } + + flusher.queue( + base_frame, + Some(NonZeroUsize::new(self.info.page_count).unwrap()), + TlbShootdownActions::FREE, + ); + } else { + for page in self.span().pages() { + // Lazy mappings do not need to be unmapped. + let Some((phys, _, flush)) = (unsafe { mapper.unmap_phys(page.start_address()) }) + else { + continue; + }; + unsafe { + flush.ignore(); + } + + flusher.queue(Frame::containing(phys), None, TlbShootdownActions::FREE); + } + } + + self.info.mapped = false; + + // Dummy value, won't be read. + let provider = core::mem::replace( + &mut self.info.provider, + Provider::AllocatedShared { + is_pinned_userscheme_borrow: false, + }, + ); + + let mut munmap_flags = MunmapFlags::empty(); + munmap_flags.set( + MunmapFlags::NEEDS_SYNC, + is_fmap_shared.unwrap_or(false) && self.info.flags.has_write(), + ); + + UnmapResult { + size: self.info.page_count * PAGE_SIZE, + file_desc: match provider { + Provider::Allocated { cow_file_ref, .. 
} => cow_file_ref, + Provider::FmapBorrowed { file_ref, .. } => Some(file_ref), + _ => None, + }, + flags: munmap_flags, + } + } + + /// Extract out a region into a separate grant. The return value is as + /// follows: (before, new split, after). Before and after may be `None`, + /// which occurs when the split off region is at the start or end of the + /// page respectively. + /// + /// # Panics + /// + /// Panics if the start or end addresses of the region is not aligned to the + /// page size. To round up the size to the nearest page size, use `.round()` + /// on the region. + /// + /// Also panics if the given region isn't completely contained within the + /// grant. Use `grant.intersect` to find a sub-region that works. + pub fn span(&self) -> PageSpan { + PageSpan::new(self.base, self.info.page_count) + } + pub fn extract(mut self, span: PageSpan) -> Option<(Option, Grant, Option)> { + assert!(self.info.can_extract(false)); + + let (before_span, this_span, after_span) = self.span().slice(span); + + let before_grant = before_span.map(|span| Grant { + base: span.base, + info: GrantInfo { + flags: self.info.flags, + mapped: self.info.mapped, + page_count: span.count, + provider: match self.info.provider { + Provider::External { + ref address_space, + src_base, + .. + } => Provider::External { + address_space: Arc::clone(address_space), + src_base, + is_pinned_userscheme_borrow: false, + }, + Provider::Allocated { + ref cow_file_ref, .. + } => Provider::Allocated { + cow_file_ref: cow_file_ref.clone(), + phys_contiguous: false, + }, + Provider::AllocatedShared { .. } => Provider::AllocatedShared { + is_pinned_userscheme_borrow: false, + }, + Provider::PhysBorrowed { base } => Provider::PhysBorrowed { base }, + Provider::FmapBorrowed { ref file_ref, .. 
} => Provider::FmapBorrowed { + file_ref: file_ref.clone(), + pin_refcount: 0, + }, + }, + }, + }); + + let middle_page_offset = before_grant.as_ref().map_or(0, |g| g.info.page_count); + + match self.info.provider { + Provider::PhysBorrowed { ref mut base } => *base = base.next_by(middle_page_offset), + Provider::FmapBorrowed { + ref mut file_ref, .. + } + | Provider::Allocated { + cow_file_ref: Some(ref mut file_ref), + .. + } => file_ref.base_offset += middle_page_offset * PAGE_SIZE, + Provider::Allocated { + cow_file_ref: None, .. + } + | Provider::AllocatedShared { .. } + | Provider::External { .. } => (), + } + + let after_grant = after_span.map(|span| Grant { + base: span.base, + info: GrantInfo { + flags: self.info.flags, + mapped: self.info.mapped, + page_count: span.count, + provider: match self.info.provider { + Provider::Allocated { + cow_file_ref: None, .. + } => Provider::Allocated { + cow_file_ref: None, + phys_contiguous: false, + }, + Provider::AllocatedShared { .. } => Provider::AllocatedShared { + is_pinned_userscheme_borrow: false, + }, + Provider::Allocated { + cow_file_ref: Some(ref file_ref), + .. + } => Provider::Allocated { + cow_file_ref: Some(GrantFileRef { + base_offset: file_ref.base_offset + this_span.count * PAGE_SIZE, + description: Arc::clone(&file_ref.description), + }), + phys_contiguous: false, + }, + Provider::External { + ref address_space, + src_base, + .. + } => Provider::External { + address_space: Arc::clone(address_space), + src_base, + is_pinned_userscheme_borrow: false, + }, + + Provider::PhysBorrowed { base } => Provider::PhysBorrowed { + base: base.next_by(this_span.count), + }, + Provider::FmapBorrowed { ref file_ref, .. 
} => Provider::FmapBorrowed { + file_ref: GrantFileRef { + base_offset: file_ref.base_offset + this_span.count * PAGE_SIZE, + description: Arc::clone(&file_ref.description), + }, + pin_refcount: 0, + }, + }, + }, + }); + + self.base = this_span.base; + self.info.page_count = this_span.count; + + Some((before_grant, self, after_grant)) + } +} +impl GrantInfo { + pub fn is_pinned(&self) -> bool { + matches!( + self.provider, + Provider::External { + is_pinned_userscheme_borrow: true, + .. + } | Provider::AllocatedShared { + is_pinned_userscheme_borrow: true, + .. + } | Provider::FmapBorrowed { + pin_refcount: 1.., + .. + } + ) + } + pub fn can_extract(&self, unpin: bool) -> bool { + (!self.is_pinned() || unpin) + | matches!( + self.provider, + Provider::Allocated { + phys_contiguous: true, + .. + } + ) + } + pub fn unpin(&mut self) { + if let Provider::External { + ref mut is_pinned_userscheme_borrow, + .. + } + | Provider::AllocatedShared { + ref mut is_pinned_userscheme_borrow, + .. + } = self.provider + { + *is_pinned_userscheme_borrow = false; + } + } + + pub fn flags(&self) -> PageFlags { + self.flags + } + pub fn page_count(&self) -> usize { + self.page_count + } + pub fn can_have_flags(&self, flags: MapFlags) -> bool { + // TODO: read (some architectures support execute-only pages) + let is_downgrade = (self.flags.has_write() || !flags.contains(MapFlags::PROT_WRITE)) + && (self.flags.has_execute() || !flags.contains(MapFlags::PROT_EXEC)); + + match self.provider { + Provider::Allocated { .. 
} => true, + _ => is_downgrade, + } + } + + pub fn can_be_merged_if_adjacent(&self, with: &Self) -> bool { + if self.mapped != with.mapped || self.flags.data() != with.flags.data() { + return false; + } + + match (&self.provider, &with.provider) { + ( + Provider::Allocated { + cow_file_ref: None, + phys_contiguous: false, + }, + Provider::Allocated { + cow_file_ref: None, + phys_contiguous: false, + }, + ) => true, + //(Provider::PhysBorrowed { base: ref lhs }, Provider::PhysBorrowed { base: ref rhs }) => lhs.next_by(self.page_count) == rhs.clone(), + //(Provider::External { address_space: ref lhs_space, src_base: ref lhs_base, cow: lhs_cow, .. }, Provider::External { address_space: ref rhs_space, src_base: ref rhs_base, cow: rhs_cow, .. }) => Arc::ptr_eq(lhs_space, rhs_space) && lhs_cow == rhs_cow && lhs_base.next_by(self.page_count) == rhs_base.clone(), + _ => false, + } + } + pub fn grant_flags(&self) -> GrantFlags { + let mut flags = GrantFlags::empty(); + // TODO: has_read + flags.set(GrantFlags::GRANT_READ, true); + + flags.set(GrantFlags::GRANT_WRITE, self.flags.has_write()); + flags.set(GrantFlags::GRANT_EXEC, self.flags.has_execute()); + + // TODO: Set GRANT_LAZY + + match self.provider { + Provider::External { + is_pinned_userscheme_borrow, + .. + } => { + flags.set(GrantFlags::GRANT_PINNED, is_pinned_userscheme_borrow); + flags |= GrantFlags::GRANT_SHARED; + } + Provider::Allocated { + ref cow_file_ref, + phys_contiguous, + } => { + // !GRANT_SHARED is equivalent to "GRANT_PRIVATE" + flags.set(GrantFlags::GRANT_SCHEME, cow_file_ref.is_some()); + flags.set(GrantFlags::GRANT_PHYS_CONTIGUOUS, phys_contiguous); + } + Provider::AllocatedShared { + is_pinned_userscheme_borrow, + } => { + flags |= GrantFlags::GRANT_SHARED; + flags.set(GrantFlags::GRANT_PINNED, is_pinned_userscheme_borrow); + } + Provider::PhysBorrowed { .. } => { + flags |= GrantFlags::GRANT_SHARED | GrantFlags::GRANT_PHYS; + } + Provider::FmapBorrowed { .. 
} => { + flags |= GrantFlags::GRANT_SHARED | GrantFlags::GRANT_SCHEME; + } + } + + flags + } + pub fn file_ref(&self) -> Option<&GrantFileRef> { + match self.provider { + Provider::FmapBorrowed { ref file_ref, .. } + | Provider::Allocated { + cow_file_ref: Some(ref file_ref), + .. + } => Some(file_ref), + _ => None, + } + } +} + +impl Drop for GrantInfo { + #[track_caller] + fn drop(&mut self) { + // XXX: This will not show the address... + assert!( + !self.mapped, + "Grant dropped while still mapped: {:#x?}", + self + ); + } +} + +pub const DANGLING: usize = 1 << (usize::BITS - 2); + +#[derive(Debug)] +pub struct Table { + pub utable: PageMapper, +} + +impl Drop for AddrSpace { + fn drop(&mut self) { + let mut token = unsafe { CleanLockToken::new() }; + self.inner_drop(&mut token); + #[cfg(feature = "drop_panic")] + { + panic!("AddrSpace dropped"); + } + } +} + +impl Drop for Table { + fn drop(&mut self) { + if self.utable.is_current() { + // TODO: Do not flush (we immediately context switch after exit(), what else is there + // to do?). Instead, we can garbage-collect such page tables in the idle kernel context + // before it waits for interrupts. Or maybe not, depends on what future benchmarks will + // indicate. + unsafe { + RmmA::set_table(TableKind::User, super::empty_cr3()); + } + } + unsafe { + deallocate_frame(Frame::containing(self.utable.table().phys())); + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum AccessMode { + Read, + Write, + InstrFetch, +} + +#[derive(Debug)] +pub enum PfError { + Segv, + Oom, + NonfatalInternalError, + // TODO: Handle recursion limit by mapping a zeroed page? Or forbid borrowing borrowed memory, + // and ensure pages are mapped at grant time? + RecursionLimitExceeded, +} + +pub struct CowResult { + /// New frame, which has been given an exclusive reference the caller can use. + pub new_frame: Frame, + + /// Old frame. 
The caller must decrease its refcount if present, after it has shot down the TLB + /// of other CPUs properly. + pub old_frame: Option, +} + +/// Consumes an existing reference to old_frame, and then returns an exclusive frame, with refcount +/// either preinitialized to One or Shared(2) depending on initial_ref_kind. This may be the same +/// frame, or (if the refcount is modified simultaneously) a new frame whereas the old frame is +/// deallocated. +fn cow( + old_frame: Frame, + old_info: &PageInfo, + initial_ref_kind: RefKind, +) -> Result { + let old_refcount = old_info.refcount(); + assert!(old_refcount.is_some()); + + let initial_rc = match initial_ref_kind { + RefKind::Cow => RefCount::One, + RefKind::Shared => RefCount::Shared(NonZeroUsize::new(2).unwrap()), + }; + + if old_refcount == Some(RefCount::One) { + // We were lucky; the frame was already exclusively owned, so the refcount cannot be + // modified unless we modify it. This is the special case where the old_frame returned is + // None. + + if initial_ref_kind == RefKind::Shared { + old_info + .refcount + .store(initial_rc.to_raw(), Ordering::Relaxed); + } + return Ok(CowResult { + new_frame: old_frame, + old_frame: None, + }); + } + + let new_frame = init_frame(initial_rc)?; + + if old_frame != the_zeroed_frame().0 { + unsafe { + copy_frame_to_frame_directly(new_frame, old_frame); + } + } + + Ok(CowResult { + new_frame, + old_frame: Some(old_frame), + }) +} + +fn map_zeroed( + mapper: &mut PageMapper, + page: Page, + page_flags: PageFlags, + _writable: bool, +) -> Result { + let new_frame = init_frame(RefCount::One)?; + + unsafe { + mapper + .map_phys(page.start_address(), new_frame.base(), page_flags) + .ok_or(PfError::Oom)? + .ignore(); + } + + Ok(new_frame) +} + +pub unsafe fn copy_frame_to_frame_directly(dst: Frame, src: Frame) { + // Optimized exact-page-size copy function? 
+ + // TODO: For new frames, when the kernel's linear phys=>virt mappings are 4k, this is almost + // guaranteed to cause either one (or two) TLB misses. + + let dst = RmmA::phys_to_virt(dst.base()).data() as *mut u8; + let src = RmmA::phys_to_virt(src.base()).data() as *const u8; + + unsafe { + dst.copy_from_nonoverlapping(src, PAGE_SIZE); + } +} + +/// Tries to resolve a page fault at `faulting_page`: takes the write lock of the current address +/// space, lets `correct_inner` fix up the mapping, and then performs the returned page flush. +/// Returns `PfError::Segv` if no current address space is set; errors from `correct_inner` are +/// propagated unchanged. +pub fn try_correcting_page_tables( + faulting_page: Page, + access: AccessMode, + token: &mut CleanLockToken, +) -> Result<(), PfError> { + let Ok(addr_space) = AddrSpace::current() else { + debug!("User page fault without address space being set."); + return Err(PfError::Segv); + }; + + let mut lock_token = token.token(); + let addr_space_lock = addr_space.acquire_write(lock_token.downgrade()); + + let (_, flush, _) = correct_inner(&addr_space, addr_space_lock, faulting_page, access, 0)?; + + flush.flush(); + + Ok(()) +} + +// TODO: maybe refactor the return type into a struct/typedef? +#[expect(clippy::type_complexity)] +/// XXX: This requires passing the L5 addr_space_guard. +/// The caller must ensure that no other lock is held at this point. +/// The caller also needs to provide a clean token for the new AddrSpace. 
+fn correct_inner<'l>( + addr_space_lock: &'l Arc, + mut addr_space: RwLockWriteGuard<'l, L5, AddrSpace>, + faulting_page: Page, + access: AccessMode, + recursion_level: u32, +) -> Result<(Frame, PageFlush, RwLockWriteGuard<'l, L5, AddrSpace>), PfError> { + let mut flusher = Flusher::with_cpu_set(&addr_space_lock.used_by, &addr_space_lock.tlb_ack); + + let Some((grant_base, grant_info)) = addr_space.grants.contains(faulting_page) else { + debug!("Lacks grant"); + return Err(PfError::Segv); + }; + + let pages_from_grant_start = faulting_page.offset_from(grant_base); + + let grant_flags = grant_info.flags(); + match access { + // TODO: has_read + AccessMode::Read => (), + + AccessMode::Write if !grant_flags.has_write() => { + debug!("Write, but grant was not PROT_WRITE."); + return Err(PfError::Segv); + } + AccessMode::InstrFetch if !grant_flags.has_execute() => { + debug!("Instuction fetch, but grant was not PROT_EXEC."); + return Err(PfError::Segv); + } + + _ => (), + } + + // By now, the memory at the faulting page is actually valid, but simply not yet mapped, either + // at all, or with the required flags. + + let faulting_frame_opt = addr_space + .table + .utable + .translate(faulting_page.start_address()) + .map(|(phys, _page_flags)| Frame::containing(phys)); + let faulting_pageinfo_opt = faulting_frame_opt.map(|frame| (frame, get_page_info(frame))); + + // TODO: Aligned readahead? AMD Zen3+ CPUs can smash 4 4k pages that are 16k-aligned, into a + // single TLB entry, thus emulating 16k pages albeit with higher page table overhead. With the + // correct madvise information, allocating 4 contiguous pages and mapping them together, might + // be a useful future optimization. + // + // TODO: Readahead backwards, i.e. MAP_GROWSDOWN. + + let mut allow_writable = true; + + let frame = match grant_info.provider { + Provider::Allocated { .. } | Provider::AllocatedShared { .. 
} + if access == AccessMode::Write => + { + match faulting_pageinfo_opt { + Some((_, None)) => unreachable!("allocated page needs frame to be valid"), + Some((frame, Some(info))) => { + if info.allows_writable() { + frame + } else { + let result = cow(frame, info, RefKind::Cow)?; + if let Some(old_frame) = result.old_frame { + flusher.queue(old_frame, None, TlbShootdownActions::FREE); + } + result.new_frame + } + } + _ => map_zeroed( + &mut addr_space.table.utable, + faulting_page, + grant_flags, + true, + )?, + } + } + + Provider::Allocated { .. } | Provider::AllocatedShared { .. } => { + match faulting_pageinfo_opt { + Some((_, None)) => unreachable!("allocated page needs frame to be valid"), + + // TODO: Can this match arm even be reached? In other words, can the TLB cache + // remember that pages are not present? + Some((frame, Some(page_info))) => { + // Keep in mind that allow_writable must always be true if this code is reached + // for AllocatedShared, since shared pages cannot be mapped lazily (without + // using AddrSpace backrefs). + allow_writable = page_info.allows_writable(); + + frame + } + + None => { + // TODO: the zeroed page first, readonly? + map_zeroed( + &mut addr_space.table.utable, + faulting_page, + grant_flags, + false, + )? + } + } + } + Provider::PhysBorrowed { base } => base.next_by(pages_from_grant_start), + Provider::External { + address_space: ref foreign_address_space, + src_base, + .. 
+ } => { + let foreign_address_space = Arc::clone(foreign_address_space); + + if Arc::ptr_eq(addr_space_lock, &foreign_address_space) { + return Err(PfError::NonfatalInternalError); + } + + // XXX: This is cheating, but guaranteed from Arc::ptr_eq above we won't deadlock + let mut free_token = unsafe { CleanLockToken::new() }; + let mut guard = foreign_address_space.acquire_upgradeable_read(free_token.downgrade()); + let src_page = src_base.next_by(pages_from_grant_start); + + match guard.grants.contains(src_page) { + Some(_) => { + let src_frame = match guard.table.utable.translate(src_page.start_address()) { + Some((phys, _)) => Frame::containing(phys), + _ => { + // Grant was valid (TODO check), but we need to correct the underlying page. + // TODO: Access mode + + // TODO: Reasonable maximum? + let new_recursion_level = recursion_level + .checked_add(1) + .filter(|new_lvl| *new_lvl < 16) + .ok_or(PfError::RecursionLimitExceeded)?; + + let guard_token = guard.into_token(); + let addr_space_guard_token = addr_space.into_token(); + drop(flusher); + + // FIXME: Can this result in invalid address space state? + let ext_addrspace = &foreign_address_space; + let mut free_token = unsafe { CleanLockToken::new() }; + let (frame, _, _) = { + let g = ext_addrspace.acquire_write(free_token.downgrade()); + correct_inner( + ext_addrspace, + g, + src_page, + AccessMode::Read, + new_recursion_level, + )? 
+ }; + + // SAFETY: Caller guarantees addr_space_guard is coming from this addr_space_lock + addr_space = + unsafe { addr_space_lock.acquire_rewrite(addr_space_guard_token) }; + flusher = Flusher::with_cpu_set( + &addr_space_lock.used_by, + &addr_space_lock.tlb_ack, + ); + + // SAFETY: We guarantee that guard is coming from foreign_address_space + guard = unsafe { + foreign_address_space.acquire_reupgradeable_read(guard_token) + }; + + frame + } + }; + + let info = + get_page_info(src_frame).expect("all allocated frames need a PageInfo"); + + match info.add_ref(RefKind::Shared) { + Ok(()) => src_frame, + Err(AddRefError::CowToShared) => { + let CowResult { + new_frame, + old_frame, + } = cow(src_frame, info, RefKind::Shared)?; + + if let Some(old_frame) = old_frame { + flusher.queue(old_frame, None, TlbShootdownActions::FREE); + flusher.flush(); + } + + let mut guard = RwLockUpgradableGuard::upgrade(guard); + + // TODO: flusher + unsafe { + guard + .table + .utable + .remap_with_full(src_page.start_address(), |_, f| { + Some((new_frame.base(), f)) + }); + } + + new_frame + } + Err(AddRefError::SharedToCow) => unreachable!(), + Err(AddRefError::RcOverflow) => return Err(PfError::Oom), + } + } + _ => { + // Grant did not exist, but we did own a Provider::External mapping, and cannot + // simply let the current context fail. TODO: But all borrowed memory shouldn't + // really be lazy though? TODO: Should a grant be created? + + let mut guard = RwLockUpgradableGuard::upgrade(guard); + + // TODO: Should this be called? + warn!("Mapped zero page since grant didn't exist"); + map_zeroed( + &mut guard.table.utable, + src_page, + grant_flags, + access == AccessMode::Write, + )? + } + } + } + // TODO: NonfatalInternalError if !MAP_LAZY and this page fault occurs. + Provider::FmapBorrowed { ref file_ref, .. 
} => { + let file_ref = file_ref.clone(); + let flags = map_flags(grant_info.flags()); + drop(flusher); + let addr_space_guard_token = addr_space.into_token(); + + // XXX: This is cheating, but guaranteed we won't deadlock because we've dropped addr_space_guard + let mut token = unsafe { CleanLockToken::new() }; + + let (scheme_id, scheme_number) = { + let desc = &file_ref.description.read(token.token()); + (desc.scheme, desc.number) + }; + let user_inner = scheme::get_scheme(token.token(), scheme_id) + .ok() + .and_then(|s| { + if let KernelSchemes::User(user) = s { + Some(user.inner) + } else { + None + } + }) + .ok_or(PfError::Segv)?; + + let offset = file_ref.base_offset as u64 + (pages_from_grant_start * PAGE_SIZE) as u64; + user_inner + .request_fmap(scheme_number, offset, 1, flags, &mut token) + .unwrap(); + + let context_lock = crate::context::current(); + context_lock + .write(token.token()) + .hard_block(HardBlockedReason::AwaitingMmap { file_ref }); + + super::switch(&mut token); + + let frame = context_lock + .write(token.token()) + .fmap_ret + .take() + .ok_or(PfError::NonfatalInternalError)?; + + // SAFETY: Caller guarantees addr_space_guard is coming from this addr_space_lock + addr_space = unsafe { addr_space_lock.acquire_rewrite(addr_space_guard_token) }; + flusher = Flusher::with_cpu_set(&addr_space_lock.used_by, &addr_space_lock.tlb_ack); + + info!("Got frame {:?} from external fmap", frame); + + frame + } + }; + + let new_flags = grant_flags.write(grant_flags.has_write() && allow_writable); + let Some(flush) = (unsafe { + addr_space + .table + .utable + .map_phys(faulting_page.start_address(), frame.base(), new_flags) + }) else { + // TODO + return Err(PfError::Oom); + }; + + drop(flusher); + Ok((frame, flush, addr_space)) +} + +#[derive(Debug)] +pub enum MmapMode { + Cow, + Shared, +} + +pub struct BorrowedFmapSource<'a> { + pub src_base: Page, + pub mode: MmapMode, + // TODO: There should be a method that obtains the lock from the guard. 
+ pub addr_space_lock: &'a Arc, + pub addr_space_guard: RwLockWriteGuard<'a, L5, AddrSpace>, +} + +pub fn handle_notify_files(notify_files: Vec, token: &mut CleanLockToken) { + for file in notify_files { + let _ = file.unmap(token); + } +} + +pub enum CopyMappingsMode { + Owned { cow_file_ref: Option }, + Borrowed, +} + +// TODO: Check if polymorphism is worth it in terms of code size performance penalty vs optimized +// away checks. +/// Receiver for TLB shootdown actions that are queued against a frame. +pub trait GenericFlusher { + // TODO: Don't require a frame unless FREE, require Page otherwise + fn queue( + &mut self, + frame: Frame, + phys_contiguous_count: Option, + actions: TlbShootdownActions, + ); +} +/// `GenericFlusher` that does not queue anything: `FREE` actions are carried out immediately via +/// `handle_free_action`, and every other action is ignored. +pub struct NopFlusher; +impl GenericFlusher for NopFlusher { + fn queue( + &mut self, + frame: Frame, + phys_contiguous_count: Option, + actions: TlbShootdownActions, + ) { + if actions.contains(TlbShootdownActions::FREE) { + handle_free_action(frame, phys_contiguous_count); + } + } +} +/// Removes one reference from `base`, or, when `phys_contiguous_count` is `Some(count)`, from each +/// of the `count` consecutive frames starting at `base`. Every frame whose `remove_ref()` returns +/// `None` is deallocated. In the single-frame case a frame without a `PageInfo` is skipped +/// silently; in the contiguous case a missing `PageInfo` panics. +fn handle_free_action(base: Frame, phys_contiguous_count: Option) { + if let Some(count) = phys_contiguous_count { + for i in 0..count.get() { + let frame = base.next_by(i); + let new_rc = get_page_info(frame) + .expect("phys_contiguous frames all need PageInfos") + .remove_ref(); + + if new_rc.is_none() { + // FIXME use a single deallocate_p2frame when possible + unsafe { + deallocate_frame(frame); + } + } + } + } else { + let Some(info) = get_page_info(base) else { + return; + }; + if info.remove_ref().is_none() { + unsafe { + deallocate_frame(base); + } + } + } +} +#[derive(Debug)] +struct FlusherState<'addrsp> { + // TODO: what capacity? 
+ pagequeue: ArrayVec, + dirty: bool, + + ackword: &'addrsp AtomicU32, +} + +#[derive(Debug)] +enum PageQueueEntry { + Free { + base: Frame, + phys_contiguous_count: Option, + }, + Other { + actions: TlbShootdownActions, + //page: Page, + }, +} + +#[derive(Debug)] +pub struct Flusher<'a, 'addrsp> { + active_cpus: &'a LogicalCpuSet, + state: FlusherState<'addrsp>, +} + +impl<'a, 'addrsp> Flusher<'a, 'addrsp> { + fn with_cpu_set(set: &'a LogicalCpuSet, ackword: &'addrsp AtomicU32) -> Self { + Self { + active_cpus: set, + state: FlusherState { + pagequeue: ArrayVec::new(), + dirty: false, + ackword, + }, + } + } + fn detach(mut self) -> FlusherState<'addrsp> { + static DUMMY: AtomicU32 = AtomicU32::new(0); + let state = core::mem::replace( + &mut self.state, + FlusherState { + pagequeue: ArrayVec::new(), + ackword: &DUMMY, + dirty: false, + }, + ); + core::mem::forget(self); + state + } + // NOTE: Lock must be held, which must be guaranteed by the caller. + pub fn flush(&mut self) { + let pages = core::mem::take(&mut self.state.pagequeue); + + #[expect(clippy::bool_comparison)] + if pages.is_empty() && core::mem::replace(&mut self.state.dirty, false) == false { + return; + } + + self.state.ackword.store(0, Ordering::SeqCst); + + let mut affected_cpu_count = 0; + + let current_cpu_id = crate::cpu_id(); + + for cpu_id in self.active_cpus.iter() { + if cpu_id == current_cpu_id { + continue; + } + + crate::percpu::shootdown_tlb_ipi(Some(cpu_id)); + + core::sync::atomic::fence(Ordering::SeqCst); + + if self.active_cpus.contains(cpu_id) { + affected_cpu_count += 1; + } + } + + if self.active_cpus.contains(current_cpu_id) { + rmm::PageFlushAll::::new().flush(); + } + + while self.state.ackword.load(Ordering::SeqCst) < affected_cpu_count { + PercpuBlock::current().maybe_handle_tlb_shootdown(); + core::hint::spin_loop(); + } + + for entry in pages { + match entry { + PageQueueEntry::Free { + base, + phys_contiguous_count, + } => { + handle_free_action(base, 
phys_contiguous_count); + } + PageQueueEntry::Other { actions } => { + // We currently invalidate everything on each flush + let _ = actions; + } + } + } + } +} +impl GenericFlusher for Flusher<'_, '_> { + fn queue( + &mut self, + frame: Frame, + phys_contiguous_count: Option, + actions: TlbShootdownActions, + ) { + let actions = actions & !TlbShootdownActions::NEW_MAPPING; + + let entry = if actions.contains(TlbShootdownActions::FREE) { + PageQueueEntry::Free { + base: frame, + phys_contiguous_count, + } + } else { + PageQueueEntry::Other { actions } + }; + self.state.dirty = true; + + if self.state.pagequeue.is_full() { + self.flush(); + } + self.state.pagequeue.push(entry); + } +} +impl Drop for Flusher<'_, '_> { + fn drop(&mut self) { + self.flush(); + } +} +bitflags::bitflags! { + #[derive(Debug)] + pub struct TlbShootdownActions: usize { + // Delay the deallocation of one or more contiguous frames. + const FREE = 1; + + // Revoke various access flags from a page + const REVOKE_READ = 1 << 1; + const REVOKE_WRITE = 1 << 2; + const REVOKE_EXEC = 1 << 3; + + // Unmap a page from one address space without deallocating it. + const MOVE = 1 << 4; + + // Add a new mapping to an address space. + // Not really a TLB shootdown action on most architectures, so almost always a no-op. + const NEW_MAPPING = 1 << 31; + } +} +impl TlbShootdownActions { + pub fn change_of_flags(old: PageFlags, new: PageFlags) -> Self { + let mut this = Self::empty(); + this.set(Self::REVOKE_WRITE, old.has_write() && !new.has_write()); + this.set(Self::REVOKE_EXEC, old.has_execute() && !new.has_execute()); + this + } +} diff --git a/src/context/mod.rs b/src/context/mod.rs new file mode 100644 index 0000000000..37c73f5a37 --- /dev/null +++ b/src/context/mod.rs @@ -0,0 +1,324 @@ +//! # Context management +//! +//! 
For resources on contexts, please consult [wikipedia](https://en.wikipedia.org/wiki/Context_switch) and [osdev](https://wiki.osdev.org/Context_Switching) + +use alloc::{ + collections::{BTreeSet, VecDeque}, + sync::{Arc, Weak}, +}; +use core::{num::NonZeroUsize, ops::Deref}; + +use crate::{ + context::memory::AddrSpaceWrapper, + cpu_set::LogicalCpuSet, + memory::{RmmA, RmmArch, TableKind}, + percpu::PercpuBlock, + sync::{ + ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard, + RwLockWriteGuard, L0, L1, L2, L4, + }, + syscall::error::Result, +}; + +use self::context::Kstack; +pub use self::{ + context::{BorrowedHtBuf, Context, Status}, + switch::switch, +}; + +pub type ContextLock = RwLock; +pub type ArcContextLockWriteGuard = ArcRwLockWriteGuard; + +#[cfg(target_arch = "aarch64")] +#[path = "arch/aarch64.rs"] +mod arch; + +#[cfg(target_arch = "x86")] +#[path = "arch/x86.rs"] +mod arch; + +#[cfg(target_arch = "x86_64")] +#[path = "arch/x86_64.rs"] +mod arch; + +#[cfg(target_arch = "riscv64")] +#[path = "arch/riscv64.rs"] +mod arch; + +/// Context struct +pub mod context; + +/// Context switch function +pub mod switch; + +/// File struct - defines a scheme and a file number +pub mod file; + +/// Memory struct - contains a set of pages for a context +pub mod memory; + +/// Signal handling +pub mod signal; + +/// Timeout handling +pub mod timeout; + +pub use self::switch::switch_finish_hook; + +/// Maximum context files +pub const CONTEXT_MAX_FILES: usize = 65_536; + +pub use self::arch::empty_cr3; + +// Set of weak references to all contexts available for scheduling. The only strong references are +// the context file descriptors. 
+static CONTEXTS: RwLock> = RwLock::new(BTreeSet::new()); + +// Actual context store for the scheduler +static RUN_CONTEXTS: Mutex = Mutex::new(RunContextData::new()); + +// Contexts that have been pushed out from RUN_CONTEXTS after being idle +static IDLE_CONTEXTS: Mutex> = Mutex::new(VecDeque::new()); + +pub struct RunContextData { + set: [VecDeque; 40], +} + +impl RunContextData { + pub const fn new() -> Self { + const EMPTY_VEC: VecDeque = VecDeque::new(); + Self { + set: [EMPTY_VEC; 40], + } + } +} + +/// Get the global set of contexts (`CONTEXTS`), read-only +pub fn contexts(token: LockToken<'_, L1>) -> RwLockReadGuard<'_, L2, BTreeSet> { + CONTEXTS.read(token) +} + +/// Get the global set of contexts (`CONTEXTS`), mutable +pub fn contexts_mut(token: LockToken<'_, L1>) -> RwLockWriteGuard<'_, L2, BTreeSet> { + CONTEXTS.write(token) +} + +/// Lock and return the queue of idle contexts (`IDLE_CONTEXTS`) +pub fn idle_contexts(token: LockToken<'_, L1>) -> MutexGuard<'_, L2, VecDeque> { + IDLE_CONTEXTS.lock(token) +} + +/// Variant of `idle_contexts` that uses `try_lock` instead of `lock` +pub fn idle_contexts_try( + token: LockToken<'_, L1>, +) -> Option>> { + IDLE_CONTEXTS.try_lock(token) +} + +/// Lock and return the scheduler's run queues (`RUN_CONTEXTS`) +pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> { + RUN_CONTEXTS.lock(token) +} + +pub fn init(token: &mut CleanLockToken) { + let owner = None; // kmain not owned by any fd + let mut context = Context::new(owner).expect("failed to create kmain context"); + context.sched_affinity = LogicalCpuSet::empty(); + context.sched_affinity.atomic_set(crate::cpu_id()); + + context.name.clear(); + context.name.push_str("[kmain]"); + + self::arch::EMPTY_CR3.call_once(|| RmmA::table(TableKind::User)); + + context.status = Status::Runnable; + context.running = true; + context.cpu_id = Some(crate::cpu_id()); + + let context_lock = Arc::new(ContextLock::new(context)); + + let context_ref = ContextRef(Arc::clone(&context_lock)); + contexts_mut(token.token().downgrade()).insert(context_ref.clone()); + // Set this as current context and idle context, but don't treat it as regular context queue + unsafe { + let percpu = 
PercpuBlock::current(); + percpu + .switch_internals + .set_current_context(Arc::clone(&context_lock)); + percpu.switch_internals.set_idle_context(context_lock); + } +} + +pub fn current() -> Arc { + PercpuBlock::current() + .switch_internals + .with_context(Arc::clone) +} +pub fn try_current() -> Option> { + PercpuBlock::current() + .switch_internals + .try_with_context(|context| context.map(Arc::clone)) +} +pub fn is_current(context: &Arc) -> bool { + PercpuBlock::current() + .switch_internals + .with_context(|current| Arc::ptr_eq(context, current)) +} + +#[derive(Clone)] +pub struct ContextRef(pub Arc); +impl Deref for ContextRef { + type Target = Arc; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Ord for ContextRef { + fn cmp(&self, other: &Self) -> core::cmp::Ordering { + Ord::cmp(&Arc::as_ptr(&self.0), &Arc::as_ptr(&other.0)) + } +} +impl PartialOrd for ContextRef { + fn partial_cmp(&self, other: &Self) -> Option { + Some(Ord::cmp(self, other)) + } +} +impl PartialEq for ContextRef { + fn eq(&self, other: &Self) -> bool { + Ord::cmp(self, other) == core::cmp::Ordering::Equal + } +} +impl Eq for ContextRef {} + +#[derive(Clone)] +pub struct WeakContextRef(pub Weak); +impl WeakContextRef { + pub fn upgrade(&self) -> Option> { + self.0.upgrade() + } +} + +impl Ord for WeakContextRef { + fn cmp(&self, other: &Self) -> core::cmp::Ordering { + Ord::cmp(&Weak::as_ptr(&self.0), &Weak::as_ptr(&other.0)) + } +} +impl PartialOrd for WeakContextRef { + fn partial_cmp(&self, other: &Self) -> Option { + Some(Ord::cmp(self, other)) + } +} +impl PartialEq for WeakContextRef { + fn eq(&self, other: &Self) -> bool { + Ord::cmp(self, other) == core::cmp::Ordering::Equal + } +} +impl Eq for WeakContextRef {} + +/// Spawn a context from a function. 
+pub fn spawn( + userspace_allowed: bool, + owner_proc_id: Option, + func: extern "C" fn(), + token: &mut CleanLockToken, +) -> Result> { + let stack = Kstack::new()?; + + let mut context = Context::new(owner_proc_id)?; + + let _ = context.set_addr_space(Some(AddrSpaceWrapper::new()?), token.downgrade()); + context + .arch + .setup_initial_call(&stack, func, userspace_allowed); + + context.kstack = Some(stack); + context.userspace = userspace_allowed; + + let context_lock = Arc::new(ContextLock::new(context)); + let context_ref = ContextRef(Arc::clone(&context_lock)); + let run_ref = WeakContextRef(Arc::downgrade(&context_ref.0)); + idle_contexts(token.downgrade()).push_back(run_ref); + contexts_mut(token.downgrade()).insert(context_ref); + + Ok(context_lock) +} + +/// A guard that disables preemption for a context while it is alive. +/// +/// This guard is used to ensure that a sequence of operations is atomic with respect to preemption. +/// It automatically re-enables preemption when dropped. +/// +/// Because the guard must hold a mutable reference to the `CleanLockToken` to re-enable preemption +/// in `Drop`, it consumes the token. The `token()` method allows re-borrowing the token for use +/// within the guard's scope. +pub struct PreemptGuard<'a> { + context: &'a ContextLock, + token: &'a mut CleanLockToken, +} + +impl<'a> PreemptGuard<'a> { + pub fn new(context: &'a ContextLock, token: &'a mut CleanLockToken) -> PreemptGuard<'a> { + context.write(token.token()).preempt_locks += 1; + PreemptGuard { context, token } + } + + /// Get a mutable reference to the underlying `CleanLockToken`. + /// + /// This is necessary because the `PreemptGuard` owns the mutable reference to the token + /// (to use it in `Drop`), so we cannot use the original `token` variable while the guard exists. 
+ pub fn token(&mut self) -> &mut CleanLockToken { + self.token + } +} + +impl Drop for PreemptGuard<'_> { + fn drop(&mut self) { + self.context.write(self.token.token()).preempt_locks -= 1; + } +} + +/// Variant of PreemptGuard behind a one-level (`L1`) token +pub struct PreemptGuardL1<'a> { + context: &'a ContextLock, + token: &'a mut LockToken<'a, L1>, +} + +impl<'a> PreemptGuardL1<'a> { + pub fn new(context: &'a ContextLock, token: &'a mut LockToken<'a, L1>) -> PreemptGuardL1<'a> { + context.write(token.token()).preempt_locks += 1; + PreemptGuardL1 { context, token } + } + + /// Get a mutable reference to the underlying `LockToken`. + pub fn token(&mut self) -> &mut LockToken<'a, L1> { + self.token + } +} + +impl Drop for PreemptGuardL1<'_> { + fn drop(&mut self) { + self.context.write(self.token.token()).preempt_locks -= 1; + } +} + +/// Variant of PreemptGuard behind a two-level (`L2`) token +pub struct PreemptGuardL2<'a> { + context: &'a ContextLock, + token: &'a mut LockToken<'a, L2>, +} + +impl<'a> PreemptGuardL2<'a> { + pub fn new(context: &'a ContextLock, token: &'a mut LockToken<'a, L2>) -> PreemptGuardL2<'a> { + context.write(token.token()).preempt_locks += 1; + PreemptGuardL2 { context, token } + } + + /// Get a mutable reference to the underlying `LockToken`. 
+ pub fn token(&mut self) -> &mut LockToken<'a, L2> { + self.token + } +} + +impl Drop for PreemptGuardL2<'_> { + fn drop(&mut self) { + self.context.write(self.token.token()).preempt_locks -= 1; + } +} diff --git a/src/context/signal.rs b/src/context/signal.rs new file mode 100644 index 0000000000..19f4ebc01f --- /dev/null +++ b/src/context/signal.rs @@ -0,0 +1,105 @@ +use core::sync::atomic::Ordering; + +use crate::{context, sync::CleanLockToken, syscall::flag::SigcontrolFlags}; + +pub fn signal_handler(token: &mut CleanLockToken) { + let context_lock = context::current(); + let mut context_guard = context_lock.write(token.token()); + let context = &mut *context_guard; + + let being_sigkilled = context.being_sigkilled; + + if being_sigkilled { + drop(context_guard); + drop(context_lock); + crate::syscall::process::exit_this_context(None, token); + } + + /*let thumbs_down = ptrace::breakpoint_callback( + PTRACE_STOP_SIGNAL, + Some(ptrace_event!(PTRACE_STOP_SIGNAL)), + ) + .and_then(|_| ptrace::next_breakpoint().map(|f| f.contains(PTRACE_FLAG_IGNORE)));*/ + + // TODO: thumbs_down + let Some((thread_ctl, proc_ctl, st)) = context.sigcontrol() else { + // Discard signal if sigcontrol is unset. + trace!("no sigcontrol, returning"); + return; + }; + if thread_ctl.currently_pending_unblocked(proc_ctl) == 0 { + // The context is currently Runnable. When transitioning into Blocked, it will check for + // signals (with the context lock held, which is required when sending signals). After + // that, any detection of pending unblocked signals by the sender, will result in the + // context being unblocked, and signals sent. 
+ + // TODO: prioritize signals over regular program execution + return; + } + let control_flags = + SigcontrolFlags::from_bits_retain(thread_ctl.control_flags.load(Ordering::Acquire)); + + if control_flags.contains(SigcontrolFlags::INHIBIT_DELIVERY) { + // Signals are inhibited to protect critical sections inside libc, but this code will run + // every time the context is switched to. + trace!("Inhibiting delivery, returning"); + return; + } + + let sigh_instr_ptr = st.user_handler.get(); + + let Some(regs) = context.regs_mut() else { + // TODO: is this even reachable? + trace!("No registers, returning"); + return; + }; + + let ip = regs.instr_pointer(); + let archdep_reg = regs.sig_archdep_reg(); + + regs.set_instr_pointer(sigh_instr_ptr); + + let (thread_ctl, _, _) = context + .sigcontrol() + .expect("cannot have been unset while holding the lock"); + + thread_ctl.saved_ip.set(ip); + thread_ctl.saved_archdep_reg.set(archdep_reg); + + thread_ctl.control_flags.store( + (control_flags | SigcontrolFlags::INHIBIT_DELIVERY).bits(), + Ordering::Release, + ); +} +pub fn excp_handler(excp: syscall::Exception) { + let mut token = unsafe { CleanLockToken::new() }; + + let current = context::current(); + + let context = current.write(token.token()); + + let Some(eh) = context.sig.as_ref().and_then(|s| s.excp_handler) else { + // TODO: Let procmgr print this? + info!( + "UNHANDLED EXCEPTION, CPU {}, PID {}, NAME {}, CONTEXT {current:p}", + crate::cpu_id(), + context.pid, + context.name + ); + drop(context); + // TODO: Allow exceptions to be caught by tracer etc, without necessarily exiting the + // context (closing files, dropping AddrSpace, etc) + crate::syscall::process::exit_this_context(Some(excp), &mut token); + }; + // TODO + /* + let Some(regs) = context.regs_mut() else { + // TODO: unhandled exception in this case too? 
+ return; + }; + let old_ip = regs.instr_pointer(); + let old_archdep_reg = regs.ar + let (tctl, pctl, sigst) = context.sigcontrol().expect("already checked"); + tctl.saved_ip.set(excp.rsp); + tctl.saved_archdep_reg*/ +} diff --git a/src/context/switch.rs b/src/context/switch.rs new file mode 100644 index 0000000000..86684c8f4c --- /dev/null +++ b/src/context/switch.rs @@ -0,0 +1,577 @@ +//! This module provides a context-switching mechanism that utilizes a simple round-robin scheduler. +//! The scheduler iterates over available contexts, selecting the next context to run, while +//! handling process states and synchronization. + +use crate::{ + context::{ + self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard, + Context, ContextLock, WeakContextRef, + }, + cpu_set::LogicalCpuId, + cpu_stats::{self, CpuState}, + percpu::PercpuBlock, + sync::{ArcRwLockWriteGuard, CleanLockToken, L4}, +}; +use alloc::{sync::Arc, vec::Vec}; +use core::{ + cell::{Cell, RefCell}, + hint, mem, + sync::atomic::Ordering, +}; +use syscall::PtraceFlags; + +enum UpdateResult { + CanSwitch, + Skip, + Blocked, +} + +// A simple geometric series where value[i] ~= value[i - 1] * 1.25 +const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ + 88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904, + 3906, 3121, 2501, 1991, 1586, 1277, 1024, 820, 655, 526, 423, 335, 272, 215, 172, 137, 110, 87, + 70, 56, 45, 36, 29, 23, 18, 15, +]; + +/// Determines if a given context is eligible to be scheduled on a given CPU (in +/// principle, the current CPU). +/// +/// # Safety +/// This function is unsafe because it modifies the `context`'s state directly without synchronization. +/// +/// # Parameters +/// - `context`: The context (process/thread) to be checked. +/// - `cpu_id`: The logical ID of the CPU on which the context is being scheduled. +/// +/// # Returns +/// - `UpdateResult::CanSwitch`: If the context can be switched to. 
+/// - `UpdateResult::Skip`: If the context should be skipped (e.g., it's running on another CPU). +unsafe fn update_runnable( + context: &mut Context, + cpu_id: LogicalCpuId, + switch_time: u128, +) -> UpdateResult { + // Ignore contexts that are already running. + if context.running { + return UpdateResult::Skip; + } + + // Ignore contexts assigned to other CPUs. + if !context.sched_affinity.contains(cpu_id) { + return UpdateResult::Skip; + } + + // If context is soft-blocked and has a wake-up time, check if it should wake up. + if context.status.is_soft_blocked() + && let Some(wake) = context.wake + && switch_time >= wake + { + context.wake = None; + context.unblock_no_ipi(); + } + + // If the context is runnable, indicate it can be switched to. + if context.status.is_runnable() { + UpdateResult::CanSwitch + } else { + UpdateResult::Blocked + } +} + +struct SwitchResultInner { + _prev_guard: ArcContextLockWriteGuard, + _next_guard: ArcContextLockWriteGuard, +} + +/// Tick function to update PIT ticks and trigger a context switch if necessary. +/// +/// Called periodically, this function increments a per-CPU tick counter and performs a context +/// switch if the counter reaches a set threshold (e.g., every 3 ticks). +/// +/// The function also calls the signal handler after switching contexts. +pub fn tick(token: &mut CleanLockToken) { + let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks; + + let new_ticks = ticks_cell.get() + 1; + ticks_cell.set(new_ticks); + + // Trigger a context switch after every 3 ticks (approx. 6.75 ms). + if new_ticks >= 3 { + switch(token); + crate::context::signal::signal_handler(token); + } +} + +/// Finishes the context switch by clearing any temporary data and resetting the lock. +/// +/// This function is called after a context switch is completed to perform cleanup, including +/// clearing the switch result data and releasing the context switch lock. 
+/// +/// # Safety +/// This function involves unsafe operations such as resetting state and releasing locks. +pub unsafe extern "C" fn switch_finish_hook() { + unsafe { + match PercpuBlock::current().switch_internals.switch_result.take() { + Some(switch_result) => { + drop(switch_result); + } + _ => { + // TODO: unreachable_unchecked()? + crate::arch::stop::emergency_reset(); + } + } + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); + crate::percpu::switch_arch_hook(); + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SwitchResult { + Switched, + AllContextsIdle, +} + +/// This function performs the context switch, using select_next_context to +/// actually select the next context to switch to. +/// +/// # Warning +/// This is not memory-unsafe to call. But do NOT call this while holding locks! +/// +/// # Returns +/// - `SwitchResult::Switched`: Indicates a successful switch to a new context. +/// - `SwitchResult::AllContextsIdle`: Indicates all contexts are idle, and the CPU will switch +/// to an idle context. +pub fn switch(token: &mut CleanLockToken) -> SwitchResult { + let switch_time = crate::time::monotonic(token); + + let percpu = PercpuBlock::current(); + cpu_stats::add_context_switch(); + + //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks + percpu.switch_internals.pit_ticks.set(0); + + // Acquire the global lock to ensure exclusive access during context switch and avoid + // issues that would be caused by the unsafe operations below + // TODO: Better memory orderings? + while arch::CONTEXT_SWITCH_LOCK + .compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed) + .is_err() + { + hint::spin_loop(); + percpu.maybe_handle_tlb_shootdown(); + } + + // Lock the previous context. 
+ let prev_context_lock = crate::context::current(); + // We are careful not to lock this context twice + let mut prev_context_guard = unsafe { prev_context_lock.write_arc() }; + + if !prev_context_guard.is_preemptable() { + // Unset global lock + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); + + // Pretend to have finished switching, so CPU is not idled + return SwitchResult::Switched; + } + + // Alarm (previously in update_runnable) + let wakeups = wakeup_contexts(token, switch_time); + + if wakeups.len() > 0 { + let mut run_contexts = run_contexts(token.token()); + for (prio, context_lock) in wakeups { + run_contexts.set[prio].push_back(context_lock); + } + } + + let cpu_id = crate::cpu_id(); + + // Update per-cpu times + let percpu_nanos = switch_time.saturating_sub(percpu.switch_internals.switch_time.get()) as u64; + let percpu_ms = percpu_nanos / 1_000_000; + let was_idle = percpu.stats.add_time(percpu_ms) == CpuState::Idle as u8; + percpu.switch_internals.switch_time.set(switch_time); + + let switch_context_opt = match select_next_context( + token, + percpu, + cpu_id, + switch_time, + was_idle, + &mut prev_context_guard, + ) { + Ok(opt) => opt, + Err(early_ret) => return early_ret, + }; + + // Switch process states, TSS stack pointer, and store new context ID + match switch_context_opt { + Some(mut next_context_guard) => { + // Update context states and prepare for the switch. 
+ let prev_context = &mut *prev_context_guard; + let next_context = &mut *next_context_guard; + + // Set the previous context as "not running" + prev_context.running = false; + + // Set the next context as "running" + next_context.running = true; + // Set the CPU ID for the next context + next_context.cpu_id = Some(cpu_id); + + // Update times + if !was_idle { + prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time); + } + next_context.switch_time = switch_time; + if next_context.userspace { + percpu.stats.set_state(cpu_stats::CpuState::User); + } else { + percpu.stats.set_state(cpu_stats::CpuState::Kernel); + } + unsafe { + percpu.switch_internals.set_current_context(Arc::clone( + ArcContextLockWriteGuard::rwlock(&next_context_guard), + )); + } + + // FIXME set the switch result in arch::switch_to instead + let prev_context = unsafe { + mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *prev_context_guard) + }; + let next_context = unsafe { + mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *next_context_guard) + }; + + percpu + .switch_internals + .switch_result + .set(Some(SwitchResultInner { + _prev_guard: prev_context_guard, + _next_guard: next_context_guard, + })); + + /*let (ptrace_session, ptrace_flags) = if let Some((session, bp)) = ptrace::sessions() + .get(&next_context.pid) + .map(|s| (Arc::downgrade(s), s.data.lock().breakpoint)) + { + (Some(session), bp.map_or(PtraceFlags::empty(), |f| f.flags)) + } else { + (None, PtraceFlags::empty()) + };*/ + let ptrace_flags = PtraceFlags::empty(); + + //*percpu.ptrace_session.borrow_mut() = ptrace_session; + percpu.ptrace_flags.set(ptrace_flags); + prev_context.inside_syscall = + percpu.inside_syscall.replace(next_context.inside_syscall); + + #[cfg(feature = "syscall_debug")] + { + prev_context.syscall_debug_info = percpu + .syscall_debug_info + .replace(next_context.syscall_debug_info); + prev_context.syscall_debug_info.on_switch_from(token); + 
next_context.syscall_debug_info.on_switch_to(token); + } + + percpu + .switch_internals + .being_sigkilled + .set(next_context.being_sigkilled); + + unsafe { + arch::switch_to(prev_context, next_context); + } + + // NOTE: After switch_to is called, the return address can even be different from the + // current return address, meaning that we cannot use local variables here, and that we + // need to use the `switch_finish_hook` to be able to release the locks. Newly created + // contexts will return directly to the function pointer passed to context::spawn, and not + // reach this code until the next context switch back. + SwitchResult::Switched + } + _ => { + // No target was found, unset global lock and return + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); + + percpu.stats.set_state(cpu_stats::CpuState::Idle); + + SwitchResult::AllContextsIdle + } + } +} + +fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize, WeakContextRef)> { + // TODO: Optimise this somehow. Perhaps using a separate timer queue? 
+ let mut wakeups = Vec::new(); + let current_context = context::current(); + let Some(idle_contexts) = idle_contexts_try(token.downgrade()) else { + // other cpus may spawning or killing contexts so let's skip wakeups to avoid contention + return wakeups; + }; + let (mut idle_contexts, mut token) = idle_contexts.into_split(); + let len = idle_contexts.len(); + for _ in 0..len { + let Some(context_ref) = idle_contexts.pop_front() else { + break; + }; + let Some(context) = context_ref.upgrade() else { + continue; + }; + if Arc::ptr_eq(&context, ¤t_context) { + idle_contexts.push_back(context_ref); + continue; + } + let Some(guard) = context.try_read(token.token()) else { + idle_contexts.push_back(context_ref); + continue; + }; + if guard.status.is_soft_blocked() { + if let Some(wake) = guard.wake { + if switch_time >= wake { + let prio = guard.prio; + drop(guard); + wakeups.push((prio, context_ref)); + continue; + } + } + } + + if guard.status.is_runnable() && !guard.running { + let prio = guard.prio; + drop(guard); + wakeups.push((prio, context_ref)); + continue; + } + + drop(guard); + idle_contexts.push_back(context_ref); + } + wakeups +} + +/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler +fn select_next_context( + token: &mut CleanLockToken, + percpu: &PercpuBlock, + cpu_id: LogicalCpuId, + switch_time: u128, + was_idle: bool, + prev_context_guard: &mut ArcRwLockWriteGuard, +) -> Result, SwitchResult> { + let contexts_data = run_contexts(token.token()); + let (mut contexts_data, mut token) = contexts_data.into_split(); + let contexts_list = &mut contexts_data.set; + let idle_context = percpu.switch_internals.idle_context(); + let mut balance = percpu.balance.get(); + let mut i = percpu.last_queue.get() % 40; + + // Lock the previous context. 
+ let prev_context_lock = crate::context::current(); + + let mut empty_queues = 0; + let mut total_iters = 0; + let mut next_context_guard_opt = None; + + let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); + let mut skipped_contexts = 0; + + 'priority: loop { + i = (i + 1) % 40; + total_iters += 1; + + // The least prioritised queue takes <5000 iters to build up + // balance = sched_prio_to_weight[20], if we have already spent + // that many iters and not found any context, it is better to just + // skip for now + if total_iters >= 5000 { + break 'priority; + } + + if skipped_contexts > total_contexts && total_contexts > 0 { + break 'priority; + } + + let contexts = contexts_list + .get_mut(i) + .expect("i should be between [0, 39]!"); + + if contexts.is_empty() { + empty_queues += 1; + if empty_queues >= 40 { + // If all queues are empty, just break out + break 'priority; + } + continue; + } else { + empty_queues = 0; + } + + if balance[i] < SCHED_PRIO_TO_WEIGHT[20] { + // This queue does not have enough balance to run, + // increment the balance! + balance[i] += SCHED_PRIO_TO_WEIGHT[i]; + continue; + } + + let len = contexts.len(); + for _ in 0..len { + let (next_context_ref, next_context_lock) = match contexts.pop_front() { + Some(lock) => match lock.upgrade() { + Some(new_lock) => (lock, new_lock), + None => { + skipped_contexts += 1; + continue; // Ghost Process, just continue + } + }, + None => break, // Empty Queue + }; + + if Arc::ptr_eq(&next_context_lock, &prev_context_lock) { + contexts.push_back(next_context_ref); + continue; + } + if Arc::ptr_eq(&next_context_lock, &idle_context) { + contexts.push_back(next_context_ref); + continue; + } + let mut next_context_guard = unsafe { next_context_lock.write_arc() }; + + // Is this context runnable on this CPU? 
+ let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) }; + if let UpdateResult::CanSwitch = sw { + next_context_guard_opt = Some(next_context_guard); + balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; + break 'priority; + } else { + if matches!(sw, UpdateResult::Blocked) { + idle_contexts(token.token()).push_back(next_context_ref); + } else { + contexts.push_back(next_context_ref); + }; + skipped_contexts += 1; + + if skipped_contexts >= total_contexts { + break 'priority; + } + } + } + } + percpu.balance.set(balance); + percpu.last_queue.set(i); + + if !Arc::ptr_eq(&prev_context_lock, &idle_context) { + // Send the old process to the back of the line (if it is still runnable) + let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); + if prev_context_guard.status.is_runnable() { + let prio = prev_context_guard.prio; + contexts_list[prio].push_back(prev_ctx); + } else { + idle_contexts(token.token()).push_back(prev_ctx); + } + } + + if let Some(next_context_guard) = next_context_guard_opt { + // We found a new process! + return Ok(Some(next_context_guard)); + } else { + if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) { + // We switch into the idle context + Ok(Some(unsafe { idle_context.write_arc() })) + } else { + // We found no other process to run. + Ok(None) + } + } +} + +/// Holds per-CPU state necessary for context switching. +/// +/// This struct contains information such as the idle context, current context, and PIT tick counts, +/// as well as fields required for managing ptrace sessions and signals. +pub struct ContextSwitchPercpu { + switch_result: Cell>, + switch_time: Cell, + pit_ticks: Cell, + + current_ctxt: RefCell>>, + + /// The idle process. 
+ idle_ctxt: RefCell>>, + pub(crate) being_sigkilled: Cell, +} + +impl ContextSwitchPercpu { + pub const fn default() -> Self { + Self { + switch_result: Cell::new(None), + switch_time: Cell::new(0), + pit_ticks: Cell::new(0), + current_ctxt: RefCell::new(None), + idle_ctxt: RefCell::new(None), + being_sigkilled: Cell::new(false), + } + } + + /// Applies a function to the current context, allowing controlled access. + /// + /// # Parameters + /// - `f`: A closure that receives a reference to the current context and returns a value. + /// + /// # Returns + /// The result of applying `f` to the current context. + pub fn with_context(&self, f: impl FnOnce(&Arc) -> T) -> T { + f(self + .current_ctxt + .borrow() + .as_ref() + .expect("not inside of context")) + } + + /// Applies a function to the current context, allowing controlled access. + /// + /// # Parameters + /// - `f`: A closure that receives a reference to the current context and returns a value. + /// + /// # Returns + /// The result of applying `f` to the current context if any. + pub fn try_with_context(&self, f: impl FnOnce(Option<&Arc>) -> T) -> T { + f(self.current_ctxt.borrow().as_ref()) + } + + /// Sets the current context to a new value. + /// + /// # Safety + /// This function is unsafe as it modifies the context state directly. + /// + /// # Parameters + /// - `new`: The new context to be set as the current context. + pub unsafe fn set_current_context(&self, new: Arc) { + *self.current_ctxt.borrow_mut() = Some(new); + } + + /// Sets the idle context to a new value. + /// + /// # Safety + /// This function is unsafe as it modifies the idle context state directly. + /// + /// # Parameters + /// - `new`: The new context to be set as the idle context. + pub unsafe fn set_idle_context(&self, new: Arc) { + *self.idle_ctxt.borrow_mut() = Some(new); + } + + /// Retrieves the current idle context. + /// + /// # Returns + /// A reference to the idle context. 
+ pub fn idle_context(&self) -> Arc { + Arc::clone( + self.idle_ctxt + .borrow() + .as_ref() + .expect("no idle context present"), + ) + } +} diff --git a/src/context/timeout.rs b/src/context/timeout.rs new file mode 100644 index 0000000000..9bf5c99095 --- /dev/null +++ b/src/context/timeout.rs @@ -0,0 +1,82 @@ +use alloc::collections::VecDeque; + +use crate::{ + event, + scheme::SchemeId, + sync::{CleanLockToken, LockToken, Mutex, MutexGuard, L0, L1}, + syscall::{ + data::TimeSpec, + flag::{CLOCK_MONOTONIC, CLOCK_REALTIME, EVENT_READ}, + }, + time, +}; + +#[derive(Debug)] +struct Timeout { + pub scheme_id: SchemeId, + pub event_id: usize, + pub clock: usize, + pub time: u128, +} + +type Registry = VecDeque; + +static REGISTRY: Mutex = Mutex::new(Registry::new()); + +/// Get the global timeouts list +fn registry(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, Registry> { + REGISTRY.lock(token) +} + +pub fn register( + scheme_id: SchemeId, + event_id: usize, + clock: usize, + time: TimeSpec, + token: &mut CleanLockToken, +) { + let mut registry = registry(token.token()); + registry.push_back(Timeout { + scheme_id, + event_id, + clock, + time: (time.tv_sec as u128 * time::NANOS_PER_SEC) + (time.tv_nsec as u128), + }); +} + +pub fn trigger(token: &mut CleanLockToken) { + let mono = time::monotonic(token); + let real = time::realtime(token); + + let mut i = 0; + loop { + let mut registry = registry(token.token()); + let timeout = if i < registry.len() { + let trigger = match registry[i].clock { + CLOCK_MONOTONIC => { + let time = registry[i].time; + mono >= time + } + CLOCK_REALTIME => { + let time = registry[i].time; + real >= time + } + clock => { + println!("timeout::trigger: unknown clock {}", clock); + true + } + }; + + if trigger { + registry.remove(i).unwrap() + } else { + i += 1; + continue; + } + } else { + break; + }; + drop(registry); + event::trigger(timeout.scheme_id, timeout.event_id, EVENT_READ, token); + } +} diff --git a/src/cpu_set.rs b/src/cpu_set.rs 
new file mode 100644 index 0000000000..4aae7781e9 --- /dev/null +++ b/src/cpu_set.rs @@ -0,0 +1,134 @@ +use core::{ + fmt::Display, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use crate::CPU_COUNT; + +/// A unique number used internally by the kernel to identify CPUs. +/// +/// This is usually but not necessarily the same as the APIC ID. +#[derive(Clone, Copy, Eq, PartialEq, Hash)] +// TODO: NonMaxUsize? +// TODO: Optimize away this type if not cfg!(feature = "multi_core") +pub struct LogicalCpuId(u32); + +impl LogicalCpuId { + pub const BSP: Self = Self::new(0); + + pub fn next() -> Self { + let id = CPU_COUNT.fetch_add(1, Ordering::Relaxed); + assert!(id < MAX_CPU_COUNT); + Self(id) + } + + pub const fn new(inner: u32) -> Self { + Self(inner) + } + pub const fn get(self) -> u32 { + self.0 + } +} + +impl core::fmt::Debug for LogicalCpuId { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "[logical cpu #{}]", self.0) + } +} +impl core::fmt::Display for LogicalCpuId { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "#{}", self.0) + } +} + +#[cfg(target_pointer_width = "64")] +pub const MAX_CPU_COUNT: u32 = 128; + +#[cfg(target_pointer_width = "32")] +pub const MAX_CPU_COUNT: u32 = 32; + +const SET_WORDS: usize = (MAX_CPU_COUNT / usize::BITS) as usize; + +// TODO: Support more than 128 CPUs. +// The maximum number of CPUs on Linux is configurable, and the type for LogicalCpuSet and +// LogicalCpuId may be optimized accordingly. In that case, box the mask if it's larger than some +// base size (probably 256 bytes). 
+/// A fixed-size set of logical CPUs, stored as a bitmask of `SET_WORDS` atomic words.
+///
+/// Bit `b` of word `w` stands for the CPU with logical id `w * usize::BITS + b`.
+#[derive(Debug)]
+pub struct LogicalCpuSet([AtomicUsize; SET_WORDS]);
+
+/// Splits a CPU id into the index of its word and the bit position inside that word.
+fn parts(id: LogicalCpuId) -> (usize, u32) {
+    ((id.get() / usize::BITS) as usize, id.get() % usize::BITS)
+}
+impl LogicalCpuSet {
+    /// Creates a set with no CPU in it.
+    pub const fn empty() -> Self {
+        Self([const { AtomicUsize::new(0) }; SET_WORDS])
+    }
+
+    /// Creates a set with every representable bit set.
+    pub const fn all() -> Self {
+        Self([const { AtomicUsize::new(!0) }; SET_WORDS])
+    }
+
+    /// Returns whether `id` is currently part of the set.
+    pub fn contains(&self, id: LogicalCpuId) -> bool {
+        let (word, bit) = parts(id);
+        self.0[word].load(Ordering::Acquire) & (1 << bit) != 0
+    }
+
+    /// Atomically adds `id` to the set.
+    pub fn atomic_set(&self, id: LogicalCpuId) {
+        let (word, bit) = parts(id);
+        let _ = self.0[word].fetch_or(1 << bit, Ordering::Release);
+    }
+
+    /// Atomically removes `id` from the set.
+    pub fn atomic_clear(&self, id: LogicalCpuId) {
+        let (word, bit) = parts(id);
+        let _ = self.0[word].fetch_and(!(1 << bit), Ordering::Release);
+    }
+
+    /// Replaces the whole set with the contents of `raw`.
+    pub fn override_from(&mut self, raw: &RawMask) {
+        self.0 = raw.map(AtomicUsize::new);
+    }
+
+    /// Copies the current contents into a plain (non-atomic) mask, one word at a time.
+    pub fn to_raw(&self) -> RawMask {
+        self.0.each_ref().map(|w| w.load(Ordering::Acquire))
+    }
+
+    /// Iterates over the ids of all CPUs in the set, in ascending order.
+    ///
+    /// Each bit is tested with its own atomic load, so concurrent modifications may or may not
+    /// be observed by an iteration that is already in progress.
+    pub fn iter(&self) -> impl Iterator<Item = LogicalCpuId> + '_ {
+        self.0.iter().enumerate().flat_map(move |(i, w)| {
+            (0..usize::BITS).filter_map(move |b| {
+                if w.load(Ordering::Acquire) & (1 << b) != 0 {
+                    Some(LogicalCpuId::new(i as u32 * usize::BITS + b))
+                } else {
+                    None
+                }
+            })
+        })
+    }
+}
+
+/// Formats the set as `_`-separated uppercase hexadecimal words, lowest word first.
+///
+/// Only as many words as are needed for `crate::cpu_count()` CPUs are printed, and bits beyond
+/// the CPU count are masked out of the last printed word.
+impl Display for LogicalCpuSet {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let cpu_count = crate::cpu_count();
+
+        let raw = self.to_raw();
+        let words = raw
+            .get(..cpu_count.div_ceil(usize::BITS) as usize)
+            .unwrap_or(&[]);
+        // Number of valid bits in the last word; 0 means the last word is completely populated.
+        let tail_bits = cpu_count % usize::BITS;
+        for (i, word) in words.iter().enumerate() {
+            if i != 0 {
+                write!(f, "_")?;
+            }
+            // Only a partially populated last word needs masking. Masking unconditionally would
+            // zero the last word whenever the CPU count is an exact multiple of `usize::BITS`.
+            let word = if i == words.len() - 1 && tail_bits != 0 {
+                *word & ((1_usize << tail_bits) - 1)
+            } else {
+                *word
+            };
+            write!(f, "{word:X}")?;
+        }
+        Ok(())
+    }
+}
+
+/// Plain (non-atomic) representation of a [`LogicalCpuSet`].
+pub type RawMask = [usize; SET_WORDS];
+
+/// Reinterprets a raw mask as a byte slice covering the whole mask.
+pub fn mask_as_bytes(mask: &RawMask) -> &[u8] {
+    // SAFETY: the pointer comes from a live `&RawMask`, the length is exactly the size of that
+    // array in bytes, and `u8` has no alignment requirement.
+    unsafe { core::slice::from_raw_parts(mask.as_ptr().cast(), size_of::<RawMask>()) }
+}
diff --git
a/src/cpu_stats.rs b/src/cpu_stats.rs new file mode 100644 index 0000000000..02e029df67 --- /dev/null +++ b/src/cpu_stats.rs @@ -0,0 +1,163 @@ +use alloc::vec::Vec; +use core::{ + fmt, + sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, +}; + +// Note: Using AtomicUsize rather than AtomicU64 as 32bit x86 doesn't support the latter +/// The number of times (overall) where a CPU switched from one context to another. +static CONTEXT_SWITCH_COUNT: AtomicUsize = AtomicUsize::new(0); +/// Number of times each Interrupt happened. +static IRQ_COUNT: [AtomicUsize; 256] = [const { AtomicUsize::new(0) }; 256]; +/// Number of contexts that were created. +static CONTEXTS_COUNT: AtomicUsize = AtomicUsize::new(0); + +/// Current state of a CPU +#[repr(u8)] +#[derive(Copy, Clone, Debug, Default)] +pub enum CpuState { + /// Waiting for runnable context + #[default] + Idle = 0, + /// Running a kernel context + Kernel = 1, + /// Running a context in the userspace + User = 2, +} + +/// Statistics for the CPUs. 
+#[derive(Debug, Default)] +pub struct CpuStats { + /// Number of ticks spent on userspace contexts + user: AtomicU64, + /// Number of ticks spent on Niced userspace contexts + nice: AtomicU64, + /// Number of ticks spent on kernel contexts + kernel: AtomicU64, + /// Number of ticks spent idle + idle: AtomicU64, + /// Number of times the CPU handled an interrupt + irq: AtomicU64, + /// Current state of the CPU + state: AtomicU8, +} + +impl CpuStats { + pub const fn default() -> Self { + Self { + user: AtomicU64::new(0), + nice: AtomicU64::new(0), + kernel: AtomicU64::new(0), + idle: AtomicU64::new(0), + irq: AtomicU64::new(0), + state: AtomicU8::new(0), + } + } +} + +pub struct CpuStatsData { + /// Number of ticks spent on userspace contexts + pub user: u64, + /// Number of ticks spent on Niced userspace contexts + pub nice: u64, + /// Number of ticks spent on kernel contexts + pub kernel: u64, + /// Number of ticks spent idle + pub idle: u64, + /// Number of times the CPU handled an interrupt + pub irq: u64, +} + +impl CpuStats { + /// Set the CPU's current state + /// + /// # Parameters + /// * `new_state` - The state of the CPU for the following ticks. + #[inline] + pub fn set_state(&self, new_state: CpuState) { + self.state.store(new_state as u8, Ordering::Relaxed); + } + + /// Increments time statistics of a CPU, return the state is was accounting to. + /// + /// Which statistic is incremented depends on the [`State`] of the CPU. + /// + /// # Parameters + /// * `nanos` - Number of nanoseconds to add. 
+ #[inline] + pub fn add_time(&self, nanos: u64) -> u8 { + let state = self.state.load(Ordering::Relaxed); + match state { + val if val == CpuState::Idle as u8 => self.idle.fetch_add(nanos, Ordering::Relaxed), + val if val == CpuState::User as u8 => self.user.fetch_add(nanos, Ordering::Relaxed), + val if val == CpuState::Kernel as u8 => self.kernel.fetch_add(nanos, Ordering::Relaxed), + _ => unreachable!("all possible values are covered"), + }; + state + } + + /// Add an IRQ event to both the global count and the CPU that handled it. + /// + /// This should be called in all [`crate::arch::interrupt:irq::eoi`], + /// for all architectures. + /// + /// # Parameters + /// * `irq` - The ID of the interrupt that happened. + #[inline] + pub fn add_irq(&self, irq: u8) { + IRQ_COUNT[irq as usize].fetch_add(1, Ordering::Relaxed); + self.irq.fetch_add(1, Ordering::Relaxed); + } +} + +impl fmt::Display for CpuStatsData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{} {} {} {} {}", + self.user, self.nice, self.kernel, self.idle, self.irq, + ) + } +} + +impl From<&CpuStats> for CpuStatsData { + fn from(val: &CpuStats) -> Self { + CpuStatsData { + user: val.user.load(Ordering::Relaxed), + nice: val.nice.load(Ordering::Relaxed), + kernel: val.kernel.load(Ordering::Relaxed), + idle: val.idle.load(Ordering::Relaxed), + irq: val.irq.load(Ordering::Relaxed), + } + } +} + +/// Add a context switch to the count. +#[inline] +pub fn add_context_switch() { + CONTEXT_SWITCH_COUNT.fetch_add(1, Ordering::Relaxed); +} + +/// Get the number of context switches. +pub fn get_context_switch_count() -> usize { + CONTEXT_SWITCH_COUNT.load(Ordering::Relaxed) +} + +/// Add a context creation to the count. +#[inline] +pub fn add_context() { + CONTEXTS_COUNT.fetch_add(1, Ordering::Relaxed); +} + +/// Get the number of contexts created. +pub fn get_contexts_count() -> usize { + CONTEXTS_COUNT.load(Ordering::Relaxed) +} + +/// Get the count of each interrupt. 
+pub fn irq_counts() -> Vec { + IRQ_COUNT + .iter() + .map(|count| count.load(Ordering::Relaxed)) + .collect() +} diff --git a/src/debugger.rs b/src/debugger.rs new file mode 100644 index 0000000000..60e59737d5 --- /dev/null +++ b/src/debugger.rs @@ -0,0 +1,366 @@ +use crate::{ + context::{context::SyscallFrame, contexts, Context, ContextLock}, + memory::{ + get_page_info, the_zeroed_frame, Frame, RefCount, RmmA, RmmArch, TableKind, PAGE_SIZE, + }, + sync::CleanLockToken, +}; +use alloc::sync::Arc; +use hashbrown::{HashMap, HashSet}; + +/// Super unsafe due to page table switching and raw pointers! +pub unsafe fn debugger(target_id: Option<*const ContextLock>, token: &mut CleanLockToken) { + println!("DEBUGGER START"); + println!(); + + let mut tree = HashMap::new(); + let mut spaces = HashSet::new(); + + tree.insert(the_zeroed_frame().0, (1, false)); + + let old_table = RmmA::table(TableKind::User); + + { + let mut contexts = contexts(token.downgrade()); + let (contexts, mut token) = contexts.token_split(); + for context_arc in contexts.iter() { + if target_id.map_or(false, |target_id| Arc::as_ptr(&context_arc) != target_id) { + continue; + } + let context = context_arc.read(token.token()); + println!("{:p}: {}", Arc::as_ptr(&context_arc), context.name); + + let mut mark_frame_use = |frame| { + tree.entry(frame).or_insert((0, false)).0 += 1; + }; + + match &context.syscall_head { + SyscallFrame::Free(head) => mark_frame_use(head.get()), + SyscallFrame::Used { _frame: head } => mark_frame_use(*head), + SyscallFrame::Dummy => {} + } + match &context.syscall_tail { + SyscallFrame::Free(tail) => mark_frame_use(tail.get()), + SyscallFrame::Used { _frame: tail } => mark_frame_use(*tail), + SyscallFrame::Dummy => {} + } + + if let Some(sig) = &context.sig { + mark_frame_use(sig.proc_control.get()); + mark_frame_use(sig.thread_control.get()); + } + + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + + // Switch to context page table to 
ensure syscall debug and stack dump will work + if let Some(ref space) = context.addr_space { + let was_new = spaces.insert( + space + .acquire_read(token.downgrade()) + .table + .utable + .table() + .phys() + .data(), + ); + unsafe { + RmmA::set_table( + TableKind::User, + space + .acquire_read(token.downgrade()) + .table + .utable + .table() + .phys(), + ); + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] + check_page_table_consistency( + &mut space.acquire_write(token.downgrade()), + was_new, + &mut tree, + ); + } + } + + println!("status: {:?}", context.status); + if !context.status_reason.is_empty() { + println!("reason: {}", context.status_reason); + } + if let Some([a, b, c, d, e, f, g]) = context.current_syscall() { + println!( + "syscall: {}", + crate::syscall::debug::format_call(a, b, c, d, e, f, g) + ); + } + if let Some(ref addr_space) = context.addr_space { + let addr_space = addr_space.acquire_read(token.downgrade()); + if !addr_space.grants.is_empty() { + println!("grants:"); + for (base, info) in addr_space.grants.iter() { + let size = info.page_count() * PAGE_SIZE; + + let flags = format_args!( + "{}{}{}{}", + if info.flags().has_user() { "u" } else { "k" }, + if info.flags().has_present() { "r" } else { "-" }, + if info.flags().has_write() { "w" } else { "-" }, + if info.flags().has_execute() { "x" } else { "-" }, + ); + + #[cfg(target_arch = "aarch64")] + println!( + " virt 0x{:016x}:0x{:016x} {} size 0x{:08x} {:?}", + base.start_address().data(), + base.next_by(info.page_count() - 1).start_address().data() + 0xFFF, + flags, + size, + info.provider, + ); + + // FIXME riscv64 implementation + + #[cfg(target_arch = "x86")] + println!( + " virt 0x{:08x}:0x{:08x} {} size 0x{:08x} {:?}", + base.start_address().data(), + base.next_by(info.page_count()).start_address().data() + 0xFFF, + flags, + size, + info.provider, + ); + + #[cfg(target_arch = "x86_64")] + println!( + " virt 0x{:016x}:0x{:016x} {} size 0x{:08x} {:?}", + 
base.start_address().data(), + base.start_address().data() + size - 1, + flags, + size, + info.provider, + ); + } + } + } + if let Some(regs) = context.regs() { + println!("regs:"); + regs.dump(); + + #[cfg(target_arch = "aarch64")] + dump_stack(&*context, regs.iret.sp_el0); + + // FIXME riscv64 implementation + + #[cfg(target_arch = "x86")] + dump_stack(&*context, regs.iret.esp); + + #[cfg(target_arch = "x86_64")] + { + unsafe { + x86::bits64::rflags::stac(); + } + dump_stack(&*context, regs.iret.rsp); + unsafe { + x86::bits64::rflags::clac(); + } + } + } + + // Switch to original page table + unsafe { RmmA::set_table(TableKind::User, old_table) }; + + println!(); + } + } + #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] + crate::scheme::proc::foreach_addrsp(token, |addrsp, mut token| { + let was_new = spaces.insert( + addrsp + .acquire_read(token.downgrade()) + .table + .utable + .table() + .phys() + .data(), + ); + unsafe { + check_page_table_consistency( + &mut *addrsp.acquire_write(token.downgrade()), + was_new, + &mut tree, + ) + }; + }); + for (frame, (count, p)) in tree { + let Some(info) = get_page_info(frame) else { + assert!(p); + continue; + }; + let (c, s) = match info.refcount() { + None => (0, ""), + Some(RefCount::One) => (1, ""), + Some(RefCount::Cow(c)) => (c.get(), " cow"), + Some(RefCount::Shared(s)) => (s.get(), " shared"), + }; + if c != count { + println!( + "frame refcount mismatch for {:?} ({} != {}{})", + frame, c, count, s + ); + } + } + + println!("DEBUGGER END"); +} + +fn dump_stack(context: &Context, mut sp: usize) { + let width = size_of::(); + + println!("stack: {:>0width$x}", sp, width = width); + let mut token = unsafe { CleanLockToken::new() }; + //Maximum 64 usizes + for _ in 0..64 { + if context.addr_space.as_ref().map_or(false, |space| { + space + .acquire_read(token.downgrade()) + .table + .utable + .translate(crate::memory::VirtualAddress::new(sp)) + .is_some() + }) { + let value = unsafe { *(sp as *const usize) 
}; + println!(" {:>0width$x}: {:>0width$x}", sp, value, width = width); + if let Some(next_sp) = sp.checked_add(size_of::()) { + sp = next_sp; + } else { + println!(" {:>0width$x}: OVERFLOW", sp, width = width); + break; + } + } else { + println!(" {:>0width$x}: GUARD PAGE", sp, width = width); + break; + } + } +} + +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] +unsafe fn check_page_table_consistency( + addr_space: &mut crate::context::memory::AddrSpace, + new_as: bool, + tree: &mut HashMap, +) { + use crate::{ + context::memory::{PageSpan, Provider}, + memory::{get_page_info, RefCount}, + }; + + let p4 = addr_space.table.utable.table(); + + for p4i in 0..256 { + let p3 = match unsafe { p4.next(p4i) } { + Some(p3) => p3, + None => continue, + }; + + for p3i in 0..512 { + let p2 = match unsafe { p3.next(p3i) } { + Some(p2) => p2, + None => continue, + }; + + for p2i in 0..512 { + let p1 = match unsafe { p2.next(p2i) } { + Some(p1) => p1, + None => continue, + }; + + for p1i in 0..512 { + use crate::memory::Page; + use rmm::VirtualAddress; + + let (physaddr, flags) = match unsafe { p1.entry(p1i) } { + Some(e) => { + if let Ok(address) = e.address() { + (address, e.flags()) + } else { + continue; + } + } + _ => continue, + }; + let address = + VirtualAddress::new((p1i << 12) | (p2i << 21) | (p3i << 30) | (p4i << 39)); + + let (base, grant) = match addr_space + .grants + .contains(Page::containing_address(address)) + { + Some(g) => g, + None => { + error!( + "ADDRESS {:p} LACKING GRANT BUT MAPPED TO {:#0x} FLAGS {:?}!", + address.data() as *const u8, + physaddr.data(), + flags + ); + continue; + } + }; + + const EXCLUDE: usize = (1 << 5) | (1 << 6); // accessed+dirty+writable + if grant.flags().write(false).data() & !EXCLUDE + != flags.write(false).data() & !EXCLUDE + { + error!( + "FLAG MISMATCH: {:?} != {:?}, address {:p} in grant at {:?}", + grant.flags(), + flags, + address.data() as *const u8, + PageSpan::new(base, grant.page_count()) + ); + } + 
let p = matches!( + grant.provider, + Provider::PhysBorrowed { .. } + | Provider::External { .. } + | Provider::FmapBorrowed { .. } + ); + let frame = Frame::containing(physaddr); + if new_as { + tree.entry(frame).or_insert((0, p)).0 += 1; + } + + if let Some(page) = get_page_info(frame) { + match page.refcount() { + None => panic!("mapped page with zero refcount"), + + Some(RefCount::One | RefCount::Shared(_)) => assert!( + !(flags.has_write() && !grant.flags().has_write()), + "page entry has higher permissions than grant!" + ), + Some(RefCount::Cow(_)) => { + assert!(!flags.has_write(), "directly writable CoW page!") + } + } + } else { + //println!("!OWNED {:?}", frame); + } + } + } + } + } + + /*for (base, info) in addr_space.grants.iter() { + let span = PageSpan::new(base, info.page_count()); + for page in span.pages() { + let _entry = match addr_space.table.utable.translate(page.start_address()) { + Some(e) => e, + None => { + error!("GRANT AT {:?} LACKING MAPPING AT PAGE {:p}", span, page.start_address().data() as *const u8); + continue; + } + }; + } + }*/ + println!("Consistency appears correct"); +} diff --git a/src/devices/graphical_debug/debug.rs b/src/devices/graphical_debug/debug.rs new file mode 100644 index 0000000000..4b684c8a30 --- /dev/null +++ b/src/devices/graphical_debug/debug.rs @@ -0,0 +1,155 @@ +use core::ptr; + +pub(super) struct Display { + pub(super) width: usize, + pub(super) height: usize, + pub(super) stride: usize, + onscreen_ptr: *mut u32, +} + +unsafe impl Send for Display {} + +static FONT: &[u8] = include_bytes!("../../../res/unifont.font"); + +enum Mode { + Plain, + Esc, + Csi, +} + +pub struct DebugDisplay { + pub(super) display: Display, + x: usize, + y: usize, + w: usize, + h: usize, + mode: Mode, +} + +impl DebugDisplay { + pub(super) fn new( + width: usize, + height: usize, + stride: usize, + onscreen_ptr: *mut u32, + ) -> DebugDisplay { + let display = Display { + width, + height, + stride, + onscreen_ptr, + }; + + let w = 
display.width / 8; + let h = display.height / 16; + DebugDisplay { + display, + x: 0, + y: 0, + w, + h, + mode: Mode::Plain, + } + } + + pub fn write(&mut self, buf: &[u8]) { + for &b in buf { + if self.x >= self.w || b == b'\n' { + self.x = 0; + self.y = (self.y + 1) % self.h; + } + + if b == b'\r' { + self.x = 0; + } + + match (b, &self.mode) { + // Byte 0x1B starts ESC sequence + (0x1B, _) => { + self.mode = Mode::Esc; + continue; + } + // Ignore other nonprintable characters + (0x00..=0x1F | 0x80..=0xFF, _) => { + self.mode = Mode::Plain; + continue; + } + // '[' after ESC starts CSI sequence + (b'[', Mode::Esc) => { + self.mode = Mode::Csi; + continue; + } + // Capture any bytes after ESC + (_, Mode::Esc) => { + self.mode = Mode::Plain; + continue; + } + // Byte 0x40 to 0x7E ends CSI + (0x40..=0x7E, Mode::Csi) => { + self.mode = Mode::Plain; + continue; + } + // Capture any bytes after CSI + (_, Mode::Csi) => { + continue; + } + // Allow any other bytes + (_, Mode::Plain) => {} + } + + if self.x == 0 { + self.clear_row(self.y); + self.clear_row((self.y + 1) % self.h); + } + + self.char(self.x * 8, self.y * 16, b as char, 0xFFFFFF); + + self.x += 1; + } + } + + fn clear_row(&mut self, y: usize) { + for row in y * 16..(y + 1) * 16 { + unsafe { + ptr::write_bytes( + self.display.onscreen_ptr.add(row * self.display.stride), + 0, + self.display.width, + ); + } + } + } + + /// Draw a character + fn char(&mut self, x: usize, y: usize, character: char, color: u32) { + if x + 8 <= self.display.width && y + 16 <= self.display.height { + let phys_y = y % self.display.height; + let mut dst = unsafe { + self.display + .onscreen_ptr + .add(phys_y * self.display.stride + x) + }; + + let font_i = 16 * (character as usize); + if font_i + 16 <= FONT.len() { + for row in 0..16 { + let row_data = FONT[font_i + row]; + for col in 0..8 { + if (row_data >> (7 - col)) & 1 == 1 { + unsafe { + *dst.add(col) = color; + } + } + } + + let next_phys_y = (phys_y + row + 1) % 
self.display.height; + dst = unsafe { + self.display + .onscreen_ptr + .add(next_phys_y * self.display.stride + x) + }; + } + } + } + } +} diff --git a/src/devices/graphical_debug/mod.rs b/src/devices/graphical_debug/mod.rs new file mode 100644 index 0000000000..b701c9a821 --- /dev/null +++ b/src/devices/graphical_debug/mod.rs @@ -0,0 +1,70 @@ +use core::str; +use spin::Mutex; + +pub use self::debug::DebugDisplay; + +pub mod debug; + +pub static DEBUG_DISPLAY: Mutex> = Mutex::new(None); + +pub static FRAMEBUFFER: Mutex<(usize, usize, usize)> = Mutex::new((0, 0, 0)); + +#[allow(unused)] +pub fn init(env: &[u8]) { + println!("Starting graphical debug"); + + let mut phys = 0; + let mut virt = 0; + let mut width = 0; + let mut height = 0; + let mut stride = 0; + + //TODO: should errors be reported? + for line in str::from_utf8(env).unwrap_or("").lines() { + let mut parts = line.splitn(2, '='); + let name = parts.next().unwrap_or(""); + let value = parts.next().unwrap_or(""); + + if name == "FRAMEBUFFER_ADDR" { + phys = usize::from_str_radix(value, 16).unwrap_or(0); + } + + if name == "FRAMEBUFFER_VIRT" { + virt = usize::from_str_radix(value, 16).unwrap_or(0); + } + + if name == "FRAMEBUFFER_WIDTH" { + width = usize::from_str_radix(value, 16).unwrap_or(0); + } + + if name == "FRAMEBUFFER_HEIGHT" { + height = usize::from_str_radix(value, 16).unwrap_or(0); + } + + if name == "FRAMEBUFFER_STRIDE" { + stride = usize::from_str_radix(value, 16).unwrap_or(0); + } + } + + *FRAMEBUFFER.lock() = (phys, virt, stride * height * 4); + + if phys == 0 || virt == 0 || width == 0 || height == 0 || stride == 0 { + println!("Framebuffer not found"); + return; + } + + println!( + "Framebuffer {}x{} stride {} at {:X} mapped to {:X}", + width, height, stride, phys, virt + ); + + let debug_display = DebugDisplay::new(width, height, stride, virt as *mut u32); + *DEBUG_DISPLAY.lock() = Some(debug_display); +} + +#[allow(unused)] +pub fn fini() { + DEBUG_DISPLAY.lock().take(); + + 
println!("Finished graphical debug"); +} diff --git a/src/devices/mod.rs b/src/devices/mod.rs new file mode 100644 index 0000000000..44430d9df6 --- /dev/null +++ b/src/devices/mod.rs @@ -0,0 +1,4 @@ +pub mod graphical_debug; +pub mod serial; +pub mod uart_16550; +pub mod uart_pl011; diff --git a/src/devices/serial.rs b/src/devices/serial.rs new file mode 100644 index 0000000000..833aeca93f --- /dev/null +++ b/src/devices/serial.rs @@ -0,0 +1,72 @@ +use syscall::Mmio; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use syscall::Pio; + +use crate::{ + devices::{uart_16550, uart_pl011}, + scheme::debug::{debug_input, debug_notify}, + sync::CleanLockToken, +}; + +#[allow(dead_code)] +pub enum SerialKind { + NotPresent, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Ns16550Pio(uart_16550::SerialPort>), + Ns16550u8(&'static mut uart_16550::SerialPort>), + Ns16550u32(&'static mut uart_16550::SerialPort>), + Pl011(uart_pl011::SerialPort), +} + +impl SerialKind { + #[cfg(target_arch = "aarch64")] + pub fn enable_irq(&mut self) { + //TODO: implement for NS16550 + match self { + Self::NotPresent => {} + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Self::Ns16550Pio(_) => {} + Self::Ns16550u8(_) => {} + Self::Ns16550u32(_) => {} + Self::Pl011(inner) => inner.enable_irq(), + } + } + + pub fn receive(&mut self, token: &mut CleanLockToken) { + //TODO: make PL011 receive work the same way as NS16550 + match self { + Self::NotPresent => {} + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Self::Ns16550Pio(inner) => { + while let Some(c) = inner.receive() { + debug_input(c, token); + } + debug_notify(token); + } + Self::Ns16550u8(inner) => { + while let Some(c) = inner.receive() { + debug_input(c, token); + } + debug_notify(token); + } + Self::Ns16550u32(inner) => { + while let Some(c) = inner.receive() { + debug_input(c, token); + } + debug_notify(token); + } + Self::Pl011(inner) => inner.receive(token), + } + } + + pub fn write(&mut 
self, buf: &[u8]) { + match self { + Self::NotPresent => {} + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Self::Ns16550Pio(inner) => inner.write(buf), + Self::Ns16550u8(inner) => inner.write(buf), + Self::Ns16550u32(inner) => inner.write(buf), + Self::Pl011(inner) => inner.write(buf), + } + } +} diff --git a/src/devices/uart_16550.rs b/src/devices/uart_16550.rs new file mode 100644 index 0000000000..9b93e2c835 --- /dev/null +++ b/src/devices/uart_16550.rs @@ -0,0 +1,163 @@ +#![allow(unused)] + +use core::{ + convert::TryInto, + ptr::{addr_of, addr_of_mut}, +}; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::syscall::io::Pio; +use crate::syscall::io::{Io, Mmio, ReadOnly}; + +bitflags! { + /// Interrupt enable flags + struct IntEnFlags: u8 { + const RECEIVED = 1; + const SENT = 1 << 1; + const ERRORED = 1 << 2; + const STATUS_CHANGE = 1 << 3; + // 4 to 7 are unused + } +} + +bitflags! { + /// Line status flags + struct LineStsFlags: u8 { + const INPUT_FULL = 1; + // 1 to 4 unknown + const OUTPUT_EMPTY = 1 << 5; + // 6 and 7 unknown + } +} + +#[allow(dead_code)] +#[repr(C, packed(4))] +pub struct SerialPort { + /// Data register, read to receive, write to send + data: T, + /// Interrupt enable + int_en: T, + /// FIFO control + fifo_ctrl: T, + /// Line control + line_ctrl: T, + /// Modem control + modem_ctrl: T, + /// Line status + line_sts: ReadOnly, + /// Modem status + modem_sts: ReadOnly, +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl SerialPort> { + pub const fn new(base: u16) -> SerialPort> { + SerialPort { + data: Pio::new(base), + int_en: Pio::new(base + 1), + fifo_ctrl: Pio::new(base + 2), + line_ctrl: Pio::new(base + 3), + modem_ctrl: Pio::new(base + 4), + line_sts: ReadOnly::new(Pio::new(base + 5)), + modem_sts: ReadOnly::new(Pio::new(base + 6)), + } + } +} + +impl SerialPort> { + #[allow(dead_code)] + pub unsafe fn new(base: usize) -> &'static mut SerialPort> { + unsafe { &mut *(base as *mut Self) 
/// Programs the UART and checks that it responds.
///
/// The sequence masks all interrupts, programs the baud divisor and the line format, enables
/// the FIFOs, runs a loopback self-test, and finally unmasks the receive interrupt.
///
/// Returns `Err(())` when one of the two loopback test bytes is not read back unchanged; in
/// that case the device is left in loopback mode with its interrupts masked.
pub fn init(&mut self) -> Result<(), ()> {
    unsafe {
        //TODO: Cleanup
        // FIXME: Fix UB if unaligned
        // NOTE(review): registers are reached through `addr_of_mut!` instead of plain field
        // borrows, presumably because the struct is declared `packed` - confirm.
        // Disable all interrupts
        (*addr_of_mut!(self.int_en)).write(0x00.into());
        // Set baud rate divisor
        // NOTE(review): 0x80 in line control is presumably the 16550 DLAB bit, which makes the
        // next two writes (`data`, `int_en`) land in the divisor latch - confirm with datasheet.
        (*addr_of_mut!(self.line_ctrl)).write(0x80.into());
        // Set divisor to 1 (115200 baud)
        (*addr_of_mut!(self.data)).write(0x01.into());
        (*addr_of_mut!(self.int_en)).write(0x00.into());
        // Use 8 data bits, no parity, one stop bit
        // (writing 0x03 also clears the 0x80 bit set above)
        (*addr_of_mut!(self.line_ctrl)).write(0x03.into());
        // Enable and clear FIFOs with 14-byte threshold
        (*addr_of_mut!(self.fifo_ctrl)).write(0xC7.into());

        // Enable loopback
        (*addr_of_mut!(self.modem_ctrl)).write(0x10.into());
        // Perform loopback test with even/odd pattern
        // Each byte is read back immediately after being written; any mismatch aborts init.
        for &byte in &[0x55, 0xAA] {
            (*addr_of_mut!(self.data)).write(byte.into());
            if (*addr_of_mut!(self.data)).read() != byte.into() {
                return Err(());
            }
        }

        // Enable DTR, RTS, OUT1, and OUT2, disable loopback
        (*addr_of_mut!(self.modem_ctrl)).write(0x0F.into());
        // Enable receive interrupt
        (*addr_of_mut!(self.int_en)).write(0x01.into());
    }

    Ok(())
}
self.send(8); + self.send(b' '); + self.send(8); + } + b'\n' => { + self.send(b'\r'); + self.send(b'\n'); + } + _ => { + self.send(b); + } + } + } + } +} diff --git a/src/devices/uart_pl011.rs b/src/devices/uart_pl011.rs new file mode 100644 index 0000000000..ba4cceb0ad --- /dev/null +++ b/src/devices/uart_pl011.rs @@ -0,0 +1,254 @@ +#![allow(unused)] + +use core::ptr; + +use crate::{ + scheme::debug::{debug_input, debug_notify}, + sync::CleanLockToken, +}; + +bitflags! { + /// UARTFR + #[derive(Clone, Copy, Debug)] + struct UartFrFlags: u32 { + const TXFE = 1 << 7; + const RXFF = 1 << 6; + const TXFF = 1 << 5; + const RXFE = 1 << 4; + const BUSY = 1 << 3; + } +} + +bitflags! { + /// UARTCR + #[derive(Clone, Copy, Debug)] + struct UartCrFlags: u32 { + const RXE = 1 << 9; + const TXE = 1 << 8; + const UARTEN = 1 << 0; + } +} + +bitflags! { + // UARTIMSC + #[derive(Clone, Copy, Debug)] + struct UartImscFlags: u32 { + const RTIM = 1 << 6; + const TXIM = 1 << 5; + const RXIM = 1 << 4; + } +} + +bitflags! { + // UARTICR + #[derive(Clone, Copy, Debug)] + struct UartIcrFlags: u32 { + const RTIC = 1 << 6; + const TXIC = 1 << 5; + const RXIC = 1 << 4; + } +} + +bitflags! { + // UARTRIS + #[derive(Clone, Copy, Debug)] + struct UartRisFlags: u32 { + const RTIS = 1 << 6; + const TXIS = 1 << 5; + const RXIS = 1 << 4; + } +} + +bitflags! { + //UARTMIS + #[derive(Clone, Copy, Debug)] + struct UartMisFlags: u32 { + const TXMIS = 1 << 5; + const RXMIS = 1 << 4; + } +} + +bitflags! { + //UARTLCR_H + #[derive(Clone, Copy, Debug)] + struct UartLcrhFlags: u32 { + const FEN = 1 << 4; + } +} + +bitflags! 
{ + //UARTIFLS + #[derive(Clone, Copy, Debug)] + struct UartIflsFlags: u32 { + const RX1_8 = 0 << 3; + const RX2_8 = 1 << 3; + const RX4_8 = 2 << 3; + const RX6_8 = 3 << 3; + const RX7_8 = 4 << 3; + const TX1_8 = 0 << 0; + const TX2_8 = 1 << 0; + const TX4_8 = 2 << 0; + const TX6_8 = 3 << 0; + const TX7_8 = 4 << 0; + } +} + +pub struct SerialPort { + base: usize, + data_reg: u8, + rcv_stat_reg: u8, + flag_reg: u8, + int_baud_reg: u8, + frac_baud_reg: u8, + line_ctrl_reg: u8, + ctrl_reg: u8, + ifls_reg: u8, + intr_mask_setclr_reg: u8, + raw_intr_stat_reg: u8, + masked_intr_stat_reg: u8, + intr_clr_reg: u8, + dma_ctrl_reg: u8, + ifls: u32, + fifo_size: u32, + cts_event_walkaround: bool, +} + +impl SerialPort { + pub const fn new(base: usize, cts_event_walkaround: bool) -> SerialPort { + SerialPort { + base, + data_reg: 0x00, + rcv_stat_reg: 0x04, + flag_reg: 0x18, + int_baud_reg: 0x24, + frac_baud_reg: 0x28, + line_ctrl_reg: 0x2c, + ctrl_reg: 0x30, + ifls_reg: 0x34, + intr_mask_setclr_reg: 0x38, + raw_intr_stat_reg: 0x3c, + masked_intr_stat_reg: 0x40, + intr_clr_reg: 0x44, + dma_ctrl_reg: 0x48, + ifls: 0x12, // RX4_8 | TX4_8 + fifo_size: 32, + cts_event_walkaround, + } + } + + pub fn read_reg(&self, register: u8) -> u32 { + unsafe { ptr::read_volatile((self.base + register as usize) as *mut u32) } + } + + pub fn write_reg(&self, register: u8, data: u32) { + unsafe { + ptr::write_volatile((self.base + register as usize) as *mut u32, data); + } + } + + pub fn init(&mut self, with_irq: bool) { + //Disable UART first + self.write_reg(self.ctrl_reg, 0x0); + + //Setup ifls + self.write_reg(self.ifls_reg, self.ifls); + + //Enable FIFO + if self.fifo_size > 1 { + let mut flags = UartLcrhFlags::from_bits_truncate(self.read_reg(self.line_ctrl_reg)); + flags |= UartLcrhFlags::FEN; + self.write_reg(self.line_ctrl_reg, flags.bits()); + } + + // Enable RX, TX, UART + let flags = UartCrFlags::RXE | UartCrFlags::TXE | UartCrFlags::UARTEN; + self.write_reg(self.ctrl_reg, 
flags.bits()); + + if with_irq { + self.enable_irq(); + } + } + + fn line_sts(&self) -> UartFrFlags { + UartFrFlags::from_bits_truncate(self.read_reg(self.flag_reg)) + } + + fn intr_stats(&self) -> UartRisFlags { + UartRisFlags::from_bits_truncate(self.read_reg(self.raw_intr_stat_reg)) + } + + pub fn drain_fifo(&mut self) { + for _ in 0..self.fifo_size * 2 { + if self.line_sts().contains(UartFrFlags::RXFE) { + break; + } + let _ = self.read_reg(self.data_reg); + } + } + + pub fn receive(&mut self, token: &mut CleanLockToken) { + let mut flags = self.intr_stats(); + let chk_flags = UartRisFlags::RTIS | UartRisFlags::RXIS; + while (flags & chk_flags).bits() != 0 { + if self.cts_event_walkaround { + self.write_reg(self.intr_clr_reg, 0x00); + let _ = self.read_reg(self.intr_clr_reg); + let _ = self.read_reg(self.intr_clr_reg); + } + + let clr = flags & (!chk_flags); + self.write_reg(self.intr_clr_reg, clr.bits()); + + for _ in 0..256 { + if self.line_sts().contains(UartFrFlags::RXFE) { + break; + } + let c = self.read_reg(self.data_reg) as u8; + if c != 0 { + debug_input(c, token); + } + } + + flags = self.intr_stats(); + } + debug_notify(token); + } + + pub fn send(&mut self, data: u8) { + while !self.line_sts().contains(UartFrFlags::TXFE) {} + self.write_reg(self.data_reg, data as u32); + } + + pub fn clear_all_irqs(&mut self) { + let flags = UartIcrFlags::RTIC | UartIcrFlags::RXIC; + self.write_reg(self.intr_clr_reg, flags.bits()); + } + + pub fn enable_irq(&mut self) { + self.clear_all_irqs(); + + self.drain_fifo(); + + let flags = UartImscFlags::RXIM | UartImscFlags::RTIM; + self.write_reg(self.intr_mask_setclr_reg, flags.bits()); + } + + pub fn write(&mut self, buf: &[u8]) { + for &b in buf { + match b { + 8 | 0x7F => { + self.send(8); + self.send(b' '); + self.send(8); + } + b'\n' => { + self.send(b'\r'); + self.send(b'\n'); + } + _ => { + self.send(b); + } + } + } + } +} diff --git a/src/dtb/irqchip.rs b/src/dtb/irqchip.rs new file mode 100644 index 
0000000000..110b51d169 --- /dev/null +++ b/src/dtb/irqchip.rs @@ -0,0 +1,422 @@ +use super::travel_interrupt_ctrl; +use crate::{ + arch::device::irqchip::new_irqchip, cpu_set::LogicalCpuId, scheme::irq::irq_trigger, + sync::CleanLockToken, +}; +use alloc::{boxed::Box, vec::Vec}; +use fdt::{node::NodeProperty, Fdt}; +use syscall::{Error, Result, EINVAL}; + +pub trait InterruptHandler { + fn irq_handler(&mut self, irq: u32, token: &mut CleanLockToken); +} + +#[derive(Debug, Copy, Clone)] +#[allow(dead_code)] +pub enum IrqCell { + L1(u32), + L2(u32, u32), + L3(u32, u32, u32), +} + +pub trait InterruptController: InterruptHandler { + fn irq_init( + &mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + ic_idx: usize, + irq_idx: &mut usize, + ) -> Result<()>; + fn irq_ack(&mut self) -> u32; + fn irq_eoi(&mut self, irq_num: u32); + fn irq_enable(&mut self, irq_num: u32); + #[allow(unused)] + fn irq_disable(&mut self, irq_num: u32); + fn irq_xlate(&self, irq_data: IrqCell) -> Result; + fn irq_to_virq(&self, hwirq: u32) -> Option; +} + +pub struct IrqConnection { + pub parent_phandle: u32, + pub parent: usize, // parent idx in chiplist + pub parent_interrupt: Option, +} + +pub struct IrqChipItem { + pub phandle: u32, + pub parents: Vec, + pub children: Vec, // child idx in chiplist + pub ic: Box, +} + +pub struct IrqChipList { + pub chips: Vec, +} + +pub struct IrqDescItem { + pub idx: usize, + pub ic_idx: usize, //ic idx in irq chip list + pub child_ic_idx: Option, //ic idx in irq chip list + pub ic_irq: u32, //hwirq in ic + pub used: bool, +} + +pub struct IrqDesc { + pub basic: IrqDescItem, + pub handler: Option>, +} + +impl IrqChipList { + fn init_inner1(&mut self, fdt: &Fdt) { + for node in fdt.all_nodes() { + if node.property("interrupt-controller").is_some() { + let Some(compatible) = node.compatible() else { + continue; + }; + let compatible = compatible.first(); + let Some(phandle) = node.property("phandle") else { + continue; + }; + let phandle = 
phandle.as_usize().unwrap() as u32; + let Some(intr_cells) = node.interrupt_cells() else { + continue; + }; + + debug!( + "{}, compatible = {}, #interrupt-cells = 0x{:08x}, phandle = 0x{:08x}", + node.name, compatible, intr_cells, phandle + ); + let mut item = IrqChipItem { + phandle, + parents: Vec::new(), + children: Vec::new(), + ic: new_irqchip(compatible).unwrap(), + }; + + fn interrupt_address( + iter: &mut impl Iterator, + interrupt_cells: usize, + ) -> Option { + match interrupt_cells { + 1 => Some(IrqCell::L1(iter.next()?)), + 2 if let Ok([a, b]) = iter.next_chunk() => Some(IrqCell::L2(a, b)), + 3 if let Ok([a, b, c]) = iter.next_chunk() => Some(IrqCell::L3(a, b, c)), + _ => None, + } + } + + fn gate_interrupt_address(addr: IrqCell) -> Option { + match addr { + IrqCell::L1(u32::MAX) + | IrqCell::L2(u32::MAX, _) + | IrqCell::L3(u32::MAX, _, _) => None, + _ => Some(addr), + } + } + + if let Some(parent) = node.interrupt_parent() + && let Some(intr_data) = node.property("interrupts") + { + // FIXME use interrupts() helper when fixed (see gh#12) + let mut intr_data = intr_data + .value + .as_chunks::<4>() + .0 + .iter() + .map(|&x| u32::from_be_bytes(x)); + let parent_phandle = parent + .property("phandle") + .and_then(NodeProperty::as_usize) + .unwrap() as u32; + let parent_interrupt_cells = parent.interrupt_cells().unwrap(); + debug!("interrupt-parent = 0x{:08x}", parent_phandle); + debug!("interrupts begin:"); + while let Some(parent_interrupt) = + interrupt_address(&mut intr_data, parent_interrupt_cells) + { + debug!("{:?}, ", parent_interrupt); + item.parents.push(IrqConnection { + parent_phandle, + parent: 0, + parent_interrupt: gate_interrupt_address(parent_interrupt), + }); + } + debug!("interrupts end"); + } else if let Some(intr_data) = node.property("interrupts-extended") { + // FIXME use the helper when fixed (see gh#37) + // Shouldn't matter much since ARM seems to not use extended interrupt and + // RISC-V seems to not use 3-sized interrupt 
addresses + let mut intr_data = intr_data + .value + .as_chunks::<4>() + .0 + .iter() + .map(|&x| u32::from_be_bytes(x)); + while let Some(parent_phandle) = intr_data.next() + && let Some(parent) = fdt.find_phandle(parent_phandle) + && let Some(parent_interrupt_cells) = parent.interrupt_cells() + && let Some(parent_interrupt) = + interrupt_address(&mut intr_data, parent_interrupt_cells) + { + debug!("{:?}, ", parent_interrupt); + item.parents.push(IrqConnection { + parent_phandle, + parent: 0, + parent_interrupt: gate_interrupt_address(parent_interrupt), + }); + } + } + + self.chips.push(item); + } + } + } + + fn init_inner2(&mut self) -> Vec { + let mut roots = Vec::new(); + + for child_i in 0..self.chips.len() { + let child = &mut self.chips[child_i]; + let phandle = child.phandle; + + if child.parents.is_empty() { + roots.push(child_i); + continue; + } + + for conn_i in 0..child.parents.len() { + let parent_phandle = self.chips[child_i].parents[conn_i].parent_phandle; + let parent_i = self + .chips + .iter() + .position(|x| parent_phandle == x.phandle) + .unwrap_or_else(|| { + panic!( + "Cannot find parent intc {} (connection from {})", + parent_phandle, phandle + ) + }); + self.chips[child_i].parents[conn_i].parent = parent_i; + let parent = &mut self.chips[parent_i]; + if !parent.children.contains(&child_i) { + parent.children.push(child_i); + } + } + } + roots + } + + fn init_inner3( + &mut self, + fdt_opt: Option<&Fdt>, + irq_desc: &mut [IrqDesc; 1024], + mut queue: Vec, + ) { + //run init + let mut irq_idx: usize = 0; + let mut queue_idx = 0; + while queue_idx < queue.len() { + let cur_idx = queue[queue_idx]; + let cur_chip = &mut self.chips[cur_idx]; + for child in &cur_chip.children { + if let Some(child_pos) = queue.iter().position(|x| *child == *x) { + assert!( + child_pos > queue_idx, + "IRQ chip tree has a cycle with phandle {} in it", + cur_chip.phandle + ); + } else { + queue.push(*child); + } + } + cur_chip + .ic + .irq_init(fdt_opt, irq_desc, 
cur_idx, &mut irq_idx) + .expect("Failed to initialize irq chip"); + + let cur_chip = &self.chips[cur_idx]; + for connection in &cur_chip.parents { + debug_assert!(queue[0..queue_idx].contains(&connection.parent)); + if let Some(parent_interrupt) = connection.parent_interrupt { + let parent = &self.chips[connection.parent]; + match parent.ic.irq_xlate(parent_interrupt) { + Ok(virq) => { + // assert is unused + irq_desc[virq].basic.child_ic_idx = Some(cur_idx); + } + _ => { + error!( + "Cannot connect irq chip {} to parent irq {} : {:?}", + cur_chip.phandle, parent.phandle, parent_interrupt + ); + } + } + } + } + + queue_idx += 1; + } + } +} + +pub struct IrqChipCore { + //TODO: support multi level interrupt constrollers + pub irq_chip_list: IrqChipList, + pub irq_desc: [IrqDesc; 1024], +} + +impl IrqChipCore { + pub fn irq_eoi(&mut self, virq: u32) { + let irq_desc = &self.irq_desc[virq as usize]; + let ic_idx = irq_desc.basic.ic_idx; + let hwirq = irq_desc.basic.ic_irq; + + self.irq_chip_list.chips[ic_idx].ic.irq_eoi(hwirq) + } + + pub fn irq_enable(&mut self, virq: u32) { + let irq_desc = &self.irq_desc[virq as usize]; + let ic_idx = irq_desc.basic.ic_idx; + let hwirq = irq_desc.basic.ic_irq; + + self.irq_chip_list.chips[ic_idx].ic.irq_enable(hwirq) + } + + #[allow(unused)] + pub fn irq_disable(&mut self, virq: u32) { + let irq_desc = &self.irq_desc[virq as usize]; + let ic_idx = irq_desc.basic.ic_idx; + let hwirq = irq_desc.basic.ic_irq; + + self.irq_chip_list.chips[ic_idx].ic.irq_disable(hwirq) + } + + #[cfg(target_arch = "riscv64")] + pub fn irq_to_virq(&self, ic_idx: usize, hwirq: u32) -> Option { + self.irq_chip_list.chips[ic_idx].ic.irq_to_virq(hwirq) + } + + pub fn irq_xlate(&self, ic_idx: usize, irq_data: &[u32]) -> Result { + let irq_data = match irq_data.len() { + 1 => IrqCell::L1(irq_data[0]), + 2 => IrqCell::L2(irq_data[0], irq_data[1]), + 3 => IrqCell::L3(irq_data[0], irq_data[1], irq_data[2]), + _ => return Err(Error::new(EINVAL)), + }; + 
self.irq_chip_list.chips[ic_idx].ic.irq_xlate(irq_data) + } + + pub fn trigger_virq(&mut self, virq: u32, token: &mut CleanLockToken) { + if virq < 1024 { + let desc = &mut self.irq_desc[virq as usize]; + match &mut desc.handler { + Some(handler) => { + handler.irq_handler(virq, token); + } + _ => { + if let Some(ic_idx) = desc.basic.child_ic_idx { + self.irq_chip_list.chips[ic_idx].ic.irq_handler(virq, token); + } else { + irq_trigger(virq as u8, token); + } + } + } + } + } + + pub fn init(&mut self, fdt_opt: Option<&Fdt>) { + for (i, desc) in self.irq_desc.iter_mut().enumerate() { + desc.basic.idx = i; + } + if let Some(fdt) = fdt_opt { + self.irq_chip_list.init_inner1(fdt); + } + let roots = self.irq_chip_list.init_inner2(); + self.irq_chip_list + .init_inner3(fdt_opt, &mut self.irq_desc, roots); + } + + pub fn phandle_to_ic_idx(&self, phandle: u32) -> Option { + self.irq_chip_list + .chips + .iter() + .position(|x| x.phandle == phandle) + } + + pub fn irq_iter_for(&self, ic_idx: u32) -> impl Iterator + '_ { + self.irq_desc.iter().filter_map(move |x| { + if x.basic.ic_idx == ic_idx as usize { + Some(x.basic.ic_irq as u8) + } else { + None + } + }) + } +} + +pub unsafe fn acknowledge(irq: usize) { + unsafe { + IRQ_CHIP.irq_eoi(irq as u32); + } +} + +const INIT_HANDLER: Option> = None; +const INIT_IRQ_DESC: IrqDesc = IrqDesc { + basic: IrqDescItem { + idx: 0, + ic_idx: 0, + ic_irq: 0, + child_ic_idx: None, + used: false, + }, + handler: INIT_HANDLER, +}; +pub static mut IRQ_CHIP: IrqChipCore = IrqChipCore { + irq_chip_list: IrqChipList { chips: Vec::new() }, + irq_desc: [INIT_IRQ_DESC; 1024], +}; + +pub fn init(fdt: &Fdt) { + travel_interrupt_ctrl(fdt); + unsafe { + IRQ_CHIP.init(Some(fdt)); + } +} + +pub fn register_irq(virq: u32, handler: Box) { + if virq >= 1024 { + error!("irq {} exceed 1024!!!", virq); + return; + } + + unsafe { + if IRQ_CHIP.irq_desc[virq as usize].handler.is_some() { + error!("irq {} has already been registered!", virq); + return; + } + + 
IRQ_CHIP.irq_desc[virq as usize].handler = Some(handler); + } +} + +#[inline] +pub fn is_reserved(_cpu_id: LogicalCpuId, index: u8) -> bool { + unsafe { IRQ_CHIP.irq_desc[index as usize].basic.used } +} + +#[inline] +pub fn set_reserved(_cpu_id: LogicalCpuId, index: u8, reserved: bool) { + unsafe { + IRQ_CHIP.irq_desc[index as usize].basic.used = reserved; + if reserved { + IRQ_CHIP.irq_enable(index as u32); + } else { + IRQ_CHIP.irq_enable(index as u32); + } + } +} + +pub fn available_irqs_iter(_cpu_id: LogicalCpuId) -> impl Iterator + 'static { + error!("available_irqs_iter has been called"); + 0..0 +} diff --git a/src/dtb/mod.rs b/src/dtb/mod.rs new file mode 100644 index 0000000000..b9b8b23094 --- /dev/null +++ b/src/dtb/mod.rs @@ -0,0 +1,246 @@ +#[cfg(dtb)] +pub mod irqchip; +pub mod serial; + +#[cfg(dtb)] +use crate::dtb::irqchip::IrqCell; +use crate::startup::memory::{register_memory_region, BootloaderMemoryKind}; +use core::slice; +use fdt::{ + node::{FdtNode, NodeProperty}, + standard_nodes::MemoryRegion, + Fdt, +}; +use rmm::PhysicalAddress; +use spin::once::Once; + +/// Represents the in-memory DTB (DeviceTree) binary. +pub static DTB_BINARY: Once<&'static [u8]> = Once::new(); + +/// Initializes the DTB from the provided base address and size. +/// +/// # Safety +/// +/// Caller must ensure the base address and size reference valid memory. +/// +/// The referenced memory must contain a valid DTB for the underlying system. +/// +/// The referenced memory must **not** be mutated for the duration of kernel run-time. 
+#[cfg_attr(not(dtb), expect(dead_code))] +pub unsafe fn init(dtb: Option<(usize, usize)>) { + let mut initialized = false; + DTB_BINARY.call_once(|| { + initialized = true; + + if let Some((dtb_base, dtb_size)) = dtb { + // SAFETY: `dtb_base` + `dtb_size` reference valid memory due to caller invariants + unsafe { slice::from_raw_parts(dtb_base as *const u8, dtb_size) } + } else { + &[] + } + }); + if !initialized { + println!("DTB_BINARY INIT TWICE!"); + } +} + +#[cfg_attr(not(dtb), expect(dead_code))] +pub fn travel_interrupt_ctrl(fdt: &Fdt) { + if let Some(root_intr_parent) = fdt + .root() + .property("interrupt-parent") + .and_then(NodeProperty::as_usize) + { + debug!("root parent = 0x{:08x}", root_intr_parent); + } + for node in fdt.all_nodes() { + if node.property("interrupt-controller").is_some() { + let Some(compatible) = node.property("compatible") else { + continue; + }; + let compatible = compatible.as_str().unwrap(); + let Some(phandle) = node.property("phandle") else { + continue; + }; + let phandle = phandle.as_usize().unwrap(); + if let Some(intr_cells) = node.interrupt_cells() { + let _intr = node + .property("interrupt-parent") + .and_then(NodeProperty::as_usize); + let _intr_data = node.property("interrupts"); + + debug!( + "{}, compatible = {}, #interrupt-cells = 0x{:08x}, phandle = 0x{:08x}", + node.name, compatible, intr_cells, phandle + ); + if let Some(intr) = _intr { + if let Some(intr_data) = _intr_data { + debug!("interrupt-parent = 0x{:08x}", intr); + debug!("interrupts begin:"); + for &chunk in intr_data.value.as_chunks::<4>().0 { + debug!("0x{:08x}, ", u32::from_be_bytes(chunk)); + } + } + debug!("interrupts end"); + } + } + } + } +} + +pub fn register_dev_memory_ranges(dt: &Fdt) { + if cfg!(target_arch = "aarch64") { + // work around for qemu-arm64 + // dev mem: 128MB - 1GB, see https://github.com/qemu/qemu/blob/master/hw/arm/virt.c for details + let root_node = dt.root(); + let is_qemu_virt = 
root_node.model().contains("linux,dummy-virt"); + + if is_qemu_virt { + register_memory_region(0x08000000, 0x08000000, BootloaderMemoryKind::Device); + register_memory_region(0x10000000, 0x30000000, BootloaderMemoryKind::Device); + return; + } + } + + let Some(soc_node) = dt.find_node("/soc") else { + warn!("failed to find /soc in devicetree"); + return; + }; + let Some(reg) = soc_node.ranges() else { + warn!("devicetree /soc has no ranges"); + return; + }; + for chunk in reg { + debug!( + "dev mem 0x{:08x} 0x{:08x} 0x{:08x} 0x{:08x}", + chunk.child_bus_address_hi, + chunk.child_bus_address, + chunk.parent_bus_address, + chunk.size + ); + + /*TODO: soc memory may contain all free memory! + register_memory_region( + chunk.parent_bus_address, + chunk.size, + BootloaderMemoryKind::Device, + );*/ + } + + // also add all soc-internal devices because they might not be shown in ranges + // (identity-mapped soc bus may have empty ranges) + for device in soc_node.children() { + if let Some(reg) = device.reg() { + for entry in reg { + if let Some(size) = entry.size { + let addr = entry.starting_address as usize; + if let Some(mapped_addr) = get_mmio_address(dt, &device, &entry) { + debug!( + "soc device {} 0x{:08x} -> 0x{:08x} size 0x{:08x}", + device.name, addr, mapped_addr, size + ); + register_memory_region(mapped_addr, size, BootloaderMemoryKind::Device); + } + } + } + } + } +} + +// FIXME return PhysicalAddress +pub fn get_mmio_address(fdt: &Fdt, _device: &FdtNode, region: &MemoryRegion) -> Option { + /* DT spec 2.3.8 "ranges": + * The ranges property provides a means of defining a mapping or translation between + * the address space of the bus (the child address space) and the address space of the bus + * node’s parent (the parent address space). + * If the property is defined with an value, it specifies that the parent and child + * address space is identical, and no address translation is required. 
+ * If the property is not present in a bus node, it is assumed that no mapping exists between + * children of the node and the parent address space. + */ + + // FIXME assumes all the devices are connected to CPUs via the /soc bus + let mut mapped_addr = region.starting_address as usize; + let size = region.size.unwrap_or(0).saturating_sub(1); + let last_address = mapped_addr.saturating_add(size); + if let Some(parent) = fdt.find_node("/soc") { + let mut ranges = parent.ranges().map(|f| f.peekable())?; + if ranges.peek().is_some() { + let parent_range = ranges.find(|x| { + x.child_bus_address <= mapped_addr && last_address - x.child_bus_address <= x.size + })?; + mapped_addr = parent_range + .parent_bus_address + .checked_add(mapped_addr - parent_range.child_bus_address)?; + let _ = mapped_addr.checked_add(size)?; + } + } + Some(mapped_addr) +} + +#[cfg_attr(not(dtb), expect(dead_code))] +pub fn interrupt_parent<'a>(fdt: &'a Fdt, node: &'a FdtNode) -> Option> { + // FIXME traverse device tree up + node.interrupt_parent() + .or_else(|| fdt.find_node("/soc").and_then(|soc| soc.interrupt_parent())) + .or_else(|| fdt.find_node("/").and_then(|node| node.interrupt_parent())) +} + +#[cfg(dtb)] +pub fn get_interrupt(fdt: &Fdt, node: &FdtNode, idx: usize) -> Option { + let interrupts = node.property("interrupts").unwrap(); + let parent_interrupt_cells = interrupt_parent(fdt, node) + .unwrap() + .interrupt_cells() + .unwrap(); + let mut intr = interrupts + .value + .as_chunks::<4>() + .0 + .iter() + .map(|&f| u32::from_be_bytes(f)) + .skip(parent_interrupt_cells * idx); + match parent_interrupt_cells { + 1 => Some(IrqCell::L1(intr.next()?)), + 2 if let Ok([a, b]) = intr.next_chunk() => Some(IrqCell::L2(a, b)), + 3 if let Ok([a, b, c]) = intr.next_chunk() => Some(IrqCell::L3(a, b, c)), + _ => None, + } +} + +pub fn diag_uart_range<'a>(dtb: &'a Fdt) -> Option<(PhysicalAddress, usize, bool, bool, &'a str)> { + let stdout_path = dtb.chosen().stdout()?; + let uart_node = 
stdout_path.node(); + let skip_init = uart_node.property("skip-init").is_some(); + let cts_event_walkaround = uart_node.property("cts-event-walkaround").is_some(); + let compatible = uart_node + .property("compatible") + .and_then(NodeProperty::as_str)?; + + let mut reg = uart_node.reg()?; + let memory = reg.next()?; + let address = get_mmio_address(dtb, &uart_node, &memory)?; + + Some(( + PhysicalAddress::new(address), + memory.size?, + skip_init, + cts_event_walkaround, + compatible, + )) +} + +#[allow(unused)] +pub fn fill_env_data(dt: &Fdt, env_base: usize) -> usize { + if let Some(bootargs) = dt.chosen().bootargs() { + let bootargs_len = bootargs.len(); + + let env_base_slice = + unsafe { slice::from_raw_parts_mut(env_base as *mut u8, bootargs_len) }; + env_base_slice[..bootargs_len].clone_from_slice(bootargs.as_bytes()); + + bootargs_len + } else { + 0 + } +} diff --git a/src/dtb/serial.rs b/src/dtb/serial.rs new file mode 100644 index 0000000000..83a569d0ef --- /dev/null +++ b/src/dtb/serial.rs @@ -0,0 +1,69 @@ +use fdt::Fdt; +use spin::Mutex; +use syscall::Mmio; + +use crate::{ + devices::{serial::SerialKind, uart_16550, uart_pl011}, + dtb::diag_uart_range, + memory::{RmmA, RmmArch}, +}; + +pub static COM1: Mutex = Mutex::new(SerialKind::NotPresent); + +#[cfg_attr(not(dtb), expect(dead_code))] +pub unsafe fn init_early(dtb: &Fdt) { + unsafe { + if !matches!(*COM1.lock(), SerialKind::NotPresent) { + // Hardcoded UART + return; + } + + if let Some((phys, size, skip_init, cts, compatible)) = diag_uart_range(dtb) { + let virt = RmmA::phys_to_virt(phys).data(); + let serial_opt = if compatible.contains("arm,pl011") { + let mut serial_port = uart_pl011::SerialPort::new(virt, cts); + if !skip_init { + serial_port.init(false); + } + Some(SerialKind::Pl011(serial_port)) + } else if compatible.contains("ns16550a") { + if cfg!(target_arch = "riscv64") { + //TODO: get actual register size from device tree + let serial_port = uart_16550::SerialPort::>::new(virt); + if 
!skip_init { + let _ = serial_port.init(); + } + Some(SerialKind::Ns16550u8(serial_port)) + } else { + //TODO: get actual register size from device tree + let serial_port = uart_16550::SerialPort::>::new(virt); + if !skip_init { + let _ = serial_port.init(); + } + Some(SerialKind::Ns16550u32(serial_port)) + } + } else if compatible.contains("snps,dw-apb-uart") { + //TODO: get actual register size from device tree + let serial_port = uart_16550::SerialPort::>::new(virt); + if !skip_init { + let _ = serial_port.init(); + } + Some(SerialKind::Ns16550u32(serial_port)) + } else { + None + }; + match serial_opt { + Some(serial) => { + *COM1.lock() = serial; + info!("UART {:?} at {:#X} size {:#X}", compatible, virt, size); + } + None => { + warn!( + "UART {:?} at {:#X} size {:#X}: no driver found", + compatible, virt, size + ); + } + } + } + } +} diff --git a/src/event.rs b/src/event.rs new file mode 100644 index 0000000000..7398145ad6 --- /dev/null +++ b/src/event.rs @@ -0,0 +1,264 @@ +use alloc::sync::Arc; +use core::sync::atomic::{AtomicUsize, Ordering}; +use hashbrown::{hash_map::DefaultHashBuilder, HashMap}; +use smallvec::SmallVec; +use syscall::data::GlobalSchemes; + +use crate::{ + context, + scheme::{self, SchemeExt, SchemeId}, + sync::{ + CleanLockToken, LockToken, RwLock, RwLockReadGuard, RwLockWriteGuard, WaitQueue, L0, L1, L2, + }, + syscall::{ + data::Event, + error::{Error, Result, EBADF}, + flag::EventFlags, + usercopy::UserSliceWo, + }, +}; + +int_like!(EventQueueId, AtomicEventQueueId, usize, AtomicUsize); + +pub struct EventQueue { + id: EventQueueId, + queue: WaitQueue, +} + +impl EventQueue { + pub fn new(id: EventQueueId) -> EventQueue { + EventQueue { + id, + queue: WaitQueue::new(), + } + } + + pub fn is_currently_empty(&self, token: &mut CleanLockToken) -> bool { + self.queue.is_currently_empty(token) + } + + pub fn read(&self, buf: UserSliceWo, block: bool, token: &mut CleanLockToken) -> Result { + self.queue + .receive_into_user(buf, block, 
"EventQueue::read", token) + } + + pub fn write(&self, events: &[Event], token: &mut CleanLockToken) -> Result { + for event in events { + let file = { + let context_ref = context::current(); + let mut context = context_ref.read(token.token()); + let (context, mut token) = context.token_split(); + let files = context.files.read(token.token()); + match files.get(event.id).ok_or(Error::new(EBADF))? { + Some(file) => file.clone(), + None => return Err(Error::new(EBADF)), + } + }; + + let (scheme, number) = { + let description = file.description.read(token.token()); + (description.scheme, description.number) + }; + + if scheme == GlobalSchemes::Event.scheme_id() && number == self.id.into() { + // Do not allow recursively registering the same event queue + //TODO: should we also disallow event queues that contain this event queue? + return Err(Error::new(EBADF)); + } + + register( + RegKey { scheme, number }, + QueueKey { + queue: self.id, + id: event.id, + data: event.data, + }, + event.flags, + token, + ); + + let flags = sync(RegKey { scheme, number }, token)?; + if !flags.is_empty() { + trigger(scheme, number, flags, token); + } + } + + Ok(events.len()) + } + + pub fn into_drop(self, token: LockToken<'_, L1>) { + self.queue.condition.into_drop_locked(token); + } +} + +pub type EventQueueList = HashMap>; + +// Next queue id +static NEXT_QUEUE_ID: AtomicUsize = AtomicUsize::new(0); + +/// Get next queue id +pub fn next_queue_id() -> EventQueueId { + EventQueueId::from(NEXT_QUEUE_ID.fetch_add(1, Ordering::SeqCst)) +} + +// Current event queues +static QUEUES: RwLock = + RwLock::new(EventQueueList::with_hasher(DefaultHashBuilder::new())); + +/// Get the event queues list, const +pub fn queues(token: LockToken<'_, L0>) -> RwLockReadGuard<'_, L2, EventQueueList> { + QUEUES.read(token) +} + +/// Get the event queues list, mutable +pub fn queues_mut(token: LockToken<'_, L0>) -> RwLockWriteGuard<'_, L2, EventQueueList> { + QUEUES.write(token) +} + +#[derive(Debug, PartialEq, 
Eq, Hash, PartialOrd, Ord)] +pub struct RegKey { + pub scheme: SchemeId, + pub number: usize, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct QueueKey { + pub queue: EventQueueId, + pub id: usize, + pub data: usize, +} + +type Registry = HashMap>; + +static REGISTRY: RwLock = + RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); + +pub fn register( + reg_key: RegKey, + queue_key: QueueKey, + flags: EventFlags, + token: &mut CleanLockToken, +) { + let mut registry = REGISTRY.write(token.token()); + + let entry = registry.entry(reg_key).or_default(); + + if flags.is_empty() { + entry.remove(&queue_key); + } else { + entry.insert(queue_key, flags); + } +} + +pub fn sync(reg_key: RegKey, token: &mut CleanLockToken) -> Result { + let mut flags = EventFlags::empty(); + + { + let registry = REGISTRY.read(token.token()); + if let Some(queue_list) = registry.get(®_key) { + for (_queue_key, &queue_flags) in queue_list.iter() { + flags |= queue_flags; + } + } + } + + let scheme = scheme::get_scheme(token.token(), reg_key.scheme)?; + + scheme.fevent(reg_key.number, flags, token) +} + +pub fn unregister_file(scheme: SchemeId, number: usize, token: &mut CleanLockToken) { + let mut registry = REGISTRY.write(token.token()); + registry.remove(&RegKey { scheme, number }); +} + +//TODO: Implement unregister_queue +// pub fn unregister_queue(scheme: SchemeId, number: usize) { +// +// } + +const MAX_EVENT: usize = 8; + +#[must_use] +fn trigger_inner( + scheme: SchemeId, + number: usize, + flags: EventFlags, + todo: &mut SmallVec<[EventQueueId; MAX_EVENT]>, + offset: &mut usize, + mut token: LockToken<'_, L1>, +) -> bool { + let mut matching_keys: SmallVec<[(QueueKey, EventFlags); MAX_EVENT]> = SmallVec::new(); + let mut full = false; + + { + let registry = REGISTRY.read(token.token()); + if let Some(queue_list) = registry.get(&RegKey { scheme, number }) { + for (queue_key, &queue_flags) in queue_list.iter().skip(*offset) { + let common_flags = 
flags & queue_flags; + if !common_flags.is_empty() { + if matching_keys.len() == matching_keys.inline_size() { + full = true; + break; + } + matching_keys.push((queue_key.clone(), common_flags)); + } + *offset += 1; + } + } + } + + while let Some((queue_key, common_flags)) = matching_keys.pop() { + let Some(queue) = QUEUES.read(token.token()).get(&queue_key.queue).cloned() else { + continue; + }; + + let event = Event { + id: queue_key.id, + flags: common_flags, + data: queue_key.data, + }; + + todo.push(queue_key.queue); + queue.queue.send_locked(event, token.token()); + if let Some(queue) = Arc::into_inner(queue) { + queue.into_drop(token.token()); + } + } + + full +} + +pub fn trigger(scheme: SchemeId, number: usize, flags: EventFlags, token: &mut CleanLockToken) { + trigger_locked(scheme, number, flags, token.token().downgrade()); +} + +pub fn trigger_locked( + scheme: SchemeId, + number: usize, + flags: EventFlags, + mut token: LockToken<'_, L1>, +) { + let mut todo = SmallVec::<[EventQueueId; MAX_EVENT]>::new(); + let mut done = SmallVec::<[EventQueueId; MAX_EVENT]>::new(); + + // First trigger with the original file + let mut offset = 0; + while trigger_inner(scheme, number, flags, &mut todo, &mut offset, token.token()) {} + + // Handle triggers on queues + while let Some(queue_id) = todo.pop() { + if let Err(insert_idx) = done.binary_search(&queue_id) { + done.insert(insert_idx, queue_id); + let mut offset = 0; + while trigger_inner( + GlobalSchemes::Event.scheme_id(), + queue_id.into(), + EventFlags::EVENT_READ, + &mut todo, + &mut offset, + token.token(), + ) {} + } + } +} diff --git a/src/log.rs b/src/log.rs new file mode 100644 index 0000000000..4be27d5aa3 --- /dev/null +++ b/src/log.rs @@ -0,0 +1,73 @@ +use alloc::collections::VecDeque; +use core::fmt; +use spin::{Mutex, MutexGuard}; + +use crate::devices::graphical_debug::{DebugDisplay, DEBUG_DISPLAY}; + +pub static LOG: Mutex> = Mutex::new(None); + +pub fn init() { + *LOG.lock() = Some(Log::new(1024 
* 1024)); +} + +pub struct Log { + data: VecDeque, + size: usize, +} + +impl Log { + pub fn new(size: usize) -> Log { + Log { + data: VecDeque::with_capacity(size), + size, + } + } + + pub fn read(&self) -> (&[u8], &[u8]) { + self.data.as_slices() + } + + pub fn write(&mut self, buf: &[u8]) { + for &b in buf { + while self.data.len() + 1 >= self.size { + self.data.pop_front(); + } + self.data.push_back(b); + } + } +} + +pub struct Writer<'a> { + log: MutexGuard<'a, Option>, + display: MutexGuard<'a, Option>, + arch: crate::arch::debug::Writer<'a>, +} + +impl<'a> Writer<'a> { + pub fn new() -> Writer<'a> { + Writer { + log: LOG.lock(), + display: DEBUG_DISPLAY.lock(), + arch: crate::arch::debug::Writer::new(), + } + } + + pub fn write(&mut self, buf: &[u8], preserve: bool) { + if preserve && let Some(ref mut log) = *self.log { + log.write(buf); + } + + if let Some(display) = &mut *self.display { + display.write(buf); + } + + self.arch.write(buf); + } +} + +impl fmt::Write for Writer<'_> { + fn write_str(&mut self, s: &str) -> Result<(), fmt::Error> { + self.write(s.as_bytes(), true); + Ok(()) + } +} diff --git a/src/macros.rs b/src/macros.rs new file mode 100644 index 0000000000..2f81a3cb13 --- /dev/null +++ b/src/macros.rs @@ -0,0 +1,56 @@ +/// Print to console +#[macro_export] +macro_rules! print { + ($($arg:tt)*) => ({ + use core::fmt::Write; + let _ = write!($crate::log::Writer::new(), $($arg)*); + }); +} + +/// Print with new line to console +#[macro_export] +macro_rules! println { + ($($arg:tt)*) => ({ + use core::fmt::Write; + let _ = writeln!($crate::log::Writer::new(), $($arg)*); + }); +} + +#[macro_export] +macro_rules! error { + ($($arg:tt)*) => { + println!("{}:ERROR -- {}", core::module_path!(), format_args!($($arg)*)); + }; +} + +#[macro_export] +macro_rules! warn { + ($($arg:tt)*) => { + println!("{}:WARN -- {}", core::module_path!(), format_args!($($arg)*)); + }; +} + +#[macro_export] +macro_rules! 
info { + ($($arg:tt)*) => { + println!("{}:INFO -- {}", core::module_path!(), format_args!($($arg)*)); + }; +} + +#[macro_export] +macro_rules! debug { + ($($arg:tt)*) => { + if cfg!(any(target_arch = "aarch64", target_arch = "riscv64")) { + println!("{}:DEBUG -- {}", core::module_path!(), format_args!($($arg)*)); + } + }; +} + +#[macro_export] +macro_rules! trace { + ($($arg:tt)*) => { + if false { + println!("{}:TRACE -- {}", core::module_path!(), format_args!($($arg)*)); + } + }; +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000000..32f491d0e8 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,145 @@ +//! # The Redox OS Kernel, version 2 +//! +//! The Redox OS Kernel is a microkernel that supports `x86_64` systems and +//! provides Unix-like syscalls for primarily Rust applications + +#![feature(asm_cfg)] // Stabilized in 1.93 +#![feature(if_let_guard)] +#![feature(int_roundings)] +#![feature(iter_next_chunk)] +#![feature(sync_unsafe_cell)] +#![feature(btree_cursors)] +#![cfg_attr(not(test), no_std)] +#![cfg_attr(not(test), no_main)] +#![allow(clippy::new_without_default)] + +#[macro_use] +extern crate alloc; + +#[macro_use] +extern crate bitflags; + +use core::sync::atomic::{AtomicU32, Ordering}; + +#[macro_use] +/// Shared data structures +mod common; + +#[macro_use] +mod macros; + +/// Architecture-dependent stuff +#[macro_use] +#[allow(dead_code)] // TODO +mod arch; +use crate::arch::{consts::*, ipi, stop, CurrentRmmArch}; +/// Offset of physmap +#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), expect(dead_code))] +const PHYS_OFFSET: usize = ::PHYS_OFFSET; + +/// Heap allocators +mod allocator; + +/// ACPI table parsing +mod acpi; + +mod dtb; + +/// Logical CPU ID and bitset types +mod cpu_set; + +/// Stats for the CPUs +mod cpu_stats; + +/// Context management +mod context; + +/// Debugger +#[cfg(feature = "debugger")] +mod debugger; + +/// Architecture-independent devices +mod devices; + +/// Event handling +mod event; + +/// 
Logging +mod log; + +/// Memory management +mod memory; + +/// Panic +mod panic; + +mod percpu; + +/// Process tracing +mod ptrace; + +/// Performance profiling of the kernel +mod profiling; + +/// Schemes, filesystem handlers +mod scheme; + +/// Early init +mod startup; + +/// Synchronization primitives +mod sync; + +/// Syscall handlers +mod syscall; + +/// Time +mod time; + +#[cfg_attr(not(test), global_allocator)] +static ALLOCATOR: allocator::Allocator = allocator::Allocator; + +/// Get the current CPU's scheduling ID +#[inline(always)] +fn cpu_id() -> crate::cpu_set::LogicalCpuId { + crate::percpu::PercpuBlock::current().cpu_id +} + +/// The count of all CPUs that can have work scheduled +static CPU_COUNT: AtomicU32 = AtomicU32::new(1); + +/// Get the number of CPUs currently active +#[inline(always)] +fn cpu_count() -> u32 { + CPU_COUNT.load(Ordering::Relaxed) +} + +macro_rules! linker_offsets( + ($($name:ident),*) => { + $( + #[inline(always)] + #[allow(non_snake_case)] + pub fn $name() -> usize { + unsafe extern "C" { + // TODO: UnsafeCell? 
+ static $name: u8; + } + (&raw const $name) as usize + } + )* + } +); +mod kernel_executable_offsets { + linker_offsets!( + KERNEL_OFFSET, + __text_start, + __text_end, + __rodata_start, + __rodata_end, + __usercopy_start, + __usercopy_end + ); + + #[cfg(target_arch = "x86_64")] + linker_offsets!(__altrelocs_start, __altrelocs_end); +} diff --git a/src/memory/kernel_mapper.rs b/src/memory/kernel_mapper.rs new file mode 100644 index 0000000000..2f7c8be611 --- /dev/null +++ b/src/memory/kernel_mapper.rs @@ -0,0 +1,91 @@ +use core::sync::{ + atomic, + atomic::{AtomicU32, AtomicUsize, Ordering}, +}; +use rmm::{PageMapper, TableKind}; + +const NO_PROCESSOR: u32 = !0; +static LOCK_OWNER: AtomicU32 = AtomicU32::new(NO_PROCESSOR); +static LOCK_COUNT: AtomicUsize = AtomicUsize::new(0); + +// TODO: Support, perhaps via const generics, embedding address checking in PageMapper, thereby +// statically enforcing that the kernel mapper can only map things in the kernel half, and vice +// versa. +/// A guard to the global lock protecting the upper 128 TiB of kernel address space. +/// +/// NOTE: Use this with great care! Since heap allocations may also require this lock when the heap +/// needs to be expended, it must not be held while memory allocations are done! +// TODO: Make the lock finer-grained so that e.g. the heap part can be independent from e.g. +// PHYS_PML4? 
+pub struct KernelMapper { + mapper: crate::memory::PageMapper, +} + +impl KernelMapper { + pub fn lock_ro() -> Self { + KernelMapper::lock() + } +} + +impl KernelMapper { + pub fn lock_rw() -> Self { + KernelMapper::lock() + } +} + +impl KernelMapper { + fn lock() -> Self { + let mapper = + unsafe { PageMapper::current(TableKind::Kernel, crate::memory::TheFrameAllocator) }; + + let current_processor = crate::cpu_id(); + loop { + match LOCK_OWNER.compare_exchange_weak( + NO_PROCESSOR, + current_processor.get(), + Ordering::Acquire, + Ordering::Relaxed, + ) { + Ok(_) => break, + // already owned by this hardware thread + Err(id) if id == current_processor.get() => break, + // either CAS failed, or some other hardware thread holds the lock + Err(_) => core::hint::spin_loop(), + } + } + + let prev_count = LOCK_COUNT.fetch_add(1, Ordering::Relaxed); + atomic::compiler_fence(Ordering::Acquire); + + let ro = prev_count > 0; + + if RW && ro { + panic!("KernelMapper locked re-entrant when write access is requested"); + } + + Self { mapper } + } +} + +impl core::ops::Deref for KernelMapper { + type Target = crate::memory::PageMapper; + + fn deref(&self) -> &Self::Target { + &self.mapper + } +} + +impl core::ops::DerefMut for KernelMapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.mapper + } +} + +impl Drop for KernelMapper { + fn drop(&mut self) { + if LOCK_COUNT.fetch_sub(1, Ordering::Relaxed) == 1 { + LOCK_OWNER.store(NO_PROCESSOR, Ordering::Release); + } + atomic::compiler_fence(Ordering::Release); + } +} diff --git a/src/memory/mod.rs b/src/memory/mod.rs new file mode 100644 index 0000000000..393ae7ebd9 --- /dev/null +++ b/src/memory/mod.rs @@ -0,0 +1,1090 @@ +//! # Memory management +//! 
Some code was borrowed from [Phil Opp's Blog](http://os.phil-opp.com/allocating-frames.html) + +use core::{ + cell::SyncUnsafeCell, + num::NonZeroUsize, + sync::atomic::{AtomicUsize, Ordering}, +}; + +pub use kernel_mapper::KernelMapper; +use spin::Mutex; + +pub use crate::arch::CurrentRmmArch as RmmA; +use crate::{ + context::{ + self, + memory::{AccessMode, PfError}, + }, + kernel_executable_offsets::{__usercopy_end, __usercopy_start}, + sync::CleanLockToken, + syscall::error::{Error, ENOMEM}, +}; +pub use rmm::{Arch as RmmArch, PageFlags, PhysicalAddress, TableKind, VirtualAddress}; +use rmm::{BumpAllocator, FrameAllocator, FrameCount, FrameUsage}; + +mod kernel_mapper; +pub mod page; +pub use page::*; + +pub type PageMapper = rmm::PageMapper; + +/// Available physical memory areas +pub(crate) static AREAS: SyncUnsafeCell<[rmm::MemoryArea; 512]> = SyncUnsafeCell::new( + [rmm::MemoryArea { + base: PhysicalAddress::new(0), + size: 0, + }; 512], +); +pub(crate) static AREA_COUNT: SyncUnsafeCell = SyncUnsafeCell::new(0); + +// TODO: Share code +pub(crate) fn areas() -> &'static [rmm::MemoryArea] { + // SAFETY: Both AREAS and AREA_COUNT are initialized once and then never changed. + // + // TODO: Memory hotplug? + unsafe { &(&*AREAS.get())[..AREA_COUNT.get().read().into()] } +} + +/// Get the number of frames available +pub fn free_frames() -> usize { + total_frames() - used_frames() +} + +/// Get the number of frames used +pub fn used_frames() -> usize { + // TODO: Include bump allocator static pages? + FREELIST.lock().used_frames +} +pub fn total_frames() -> usize { + // TODO: Include bump allocator static pages? 
+ sections().iter().map(|section| section.frames.len()).sum() +} + +/// Allocate a range of frames +pub fn allocate_p2frame(order: u32) -> Option { + allocate_p2frame_complex(order, (), None, order).map(|(f, _)| f) +} +pub fn allocate_frame() -> Option { + allocate_p2frame(0) +} +// TODO: Flags, strategy +pub fn allocate_p2frame_complex( + _req_order: u32, + _flags: (), + _strategy: Option<()>, + min_order: u32, +) -> Option<(Frame, usize)> { + let mut freelist = FREELIST.lock(); + + let (frame_order, frame) = freelist + .for_orders + .iter() + .enumerate() + .skip(min_order as usize) + .find_map(|(i, f)| f.map(|f| (i as u32, f)))?; + + let info = get_page_info(frame) + .unwrap_or_else(|| panic!("no page info for allocated frame {frame:?}")) + .as_free() + .expect("freelist frames must not be marked used!"); + let next_free = info.next(); + //info!("FREE {frame:?} ORDER {frame_order} NEXT_FREE {next_free:?}"); + + debug_assert_eq!( + next_free.order(), + frame_order, + "{frame:?}->next {next_free:?}.order != {frame_order}" + ); + if let Some(next) = next_free.frame() { + let f = get_free_alloc_page_info(next); + debug_assert_eq!(f.prev().frame(), Some(frame)); + debug_assert_ne!(next, frame); + debug_assert!( + next.is_aligned_to_order(frame_order), + "NEXT {next:?} UNALIGNED" + ); + f.set_prev(P2Frame::new(None, frame_order)); + } + + debug_assert!(frame.is_aligned_to_order(frame_order)); + debug_assert_eq!(next_free.order(), frame_order); + freelist.for_orders[frame_order as usize] = next_free.frame(); + + // TODO: Is this LIFO cache optimal? 
+ //info!("MIN{min_order}FRAMEORD{frame_order}"); + for order in (min_order..frame_order).rev() { + //info!("SPLIT ORDER {order}"); + let order_page_count = 1 << order; + + let hi = frame.next_by(order_page_count); + //info!("SPLIT INTO {frame:?}:{hi:?} ORDER {order}"); + + debug_assert_eq!(freelist.for_orders[order as usize], None); + + let hi_info = get_page_info(hi) + .expect("sub-p2frame of split p2flame lacked PageInfo") + .make_free(order); + debug_assert!(!hi.is_aligned_to_order(frame_order)); + debug_assert!(hi.is_aligned_to_order(order)); + hi_info.set_next(P2Frame::new(None, order)); + hi_info.set_prev(P2Frame::new(None, order)); + freelist.for_orders[order as usize] = Some(hi); + } + + freelist.used_frames += 1 << min_order; + + info.mark_used(); + drop(freelist); + + unsafe { + (RmmA::phys_to_virt(frame.base()).data() as *mut u8).write_bytes(0, PAGE_SIZE << min_order); + } + + debug_assert!(frame.base().data() >= unsafe { ALLOCATOR_DATA.abs_off }); + + Some((frame, PAGE_SIZE << min_order)) +} + +pub unsafe fn deallocate_p2frame(orig_frame: Frame, order: u32) { + let mut freelist = FREELIST.lock(); + + let initial_info = get_page_info(orig_frame) + .unwrap_or_else(|| panic!("missing PageInfo for {orig_frame:?} being freed")); + initial_info.refcount.store(0, Ordering::Relaxed); + + let mut largest_order = order; + + let mut current = orig_frame; + + for merge_order in order..MAX_ORDER { + // Because there's a PageInfo, this frame must be allocator-owned. We need to be very + // careful with who owns this page, as the refcount can be anything from 0 (undefined) to + // 2^addrwidth - 1. However, allocation and deallocation must be synchronized (the "next" + // word of the PageInfo). 
+ + let sibling = Frame::containing(PhysicalAddress::new( + current.base().data() ^ (PAGE_SIZE << merge_order), + )); + + let Some(_cur_info) = get_page_info(current) else { + unreachable!("attempting to free non-allocator-owned page"); + }; + + let Some(sib_info) = get_page_info(sibling) else { + // The frame that was deallocated, was at the unaligned start or end of its section + // (i.e. there aren't 1 << merge_order additional pages). + break; + }; + + let Some(sib_info) = sib_info.as_free() else { + // The frame is currently in use (refcounted). It cannot be merged! + break; + }; + + // If the sibling p2frame has lower order than merge_order, it cannot be merged into + // current. + if sib_info.next().order() < merge_order { + break; + } + debug_assert!( + (sib_info.next().order() <= merge_order), + "sibling page has unaligned order or contains current page" + ); + //info!("MERGED {lo:?} WITH {hi:?} ORDER {order}"); + + if let Some(sib_prev) = sib_info.prev().frame() { + get_free_alloc_page_info(sib_prev).set_next(sib_info.next()); + } else { + debug_assert_eq!(freelist.for_orders[merge_order as usize], Some(sibling)); + debug_assert!(sib_info + .next() + .frame() + .is_none_or(|f| f.is_aligned_to_order(merge_order))); + debug_assert_eq!(sib_info.next().order(), merge_order); + freelist.for_orders[merge_order as usize] = sib_info.next().frame(); + } + if let Some(sib_next) = sib_info.next().frame() { + get_free_alloc_page_info(sib_next).set_prev(sib_info.prev()); + } + + current = Frame::containing(PhysicalAddress::new( + current.base().data() & !(PAGE_SIZE << merge_order), + )); + + largest_order = merge_order + 1; + } + get_page_info(current) + .expect("freeing frame without PageInfo") + .make_free(largest_order); + + let new_head = current; + debug_assert!(new_head.is_aligned_to_order(largest_order)); + + if let Some(old_head) = freelist.for_orders[largest_order as usize].replace(new_head) { + //info!("HEAD {:p} FREED {:p} BARRIER {:p}", 
get_page_info(old_head).unwrap(), get_page_info(frame).unwrap(), unsafe { ALLOCATOR_DATA.abs_off as *const u8 }); + let old_head_info = get_free_alloc_page_info(old_head); + let new_head_info = get_free_alloc_page_info(new_head); + + new_head_info.set_next(P2Frame::new(Some(old_head), largest_order)); + new_head_info.set_prev(P2Frame::new(None, largest_order)); + old_head_info.set_prev(P2Frame::new(Some(new_head), largest_order)); + } + + //info!("FREED {frame:?}+2^{order}"); + freelist.used_frames -= 1 << order; +} + +pub unsafe fn deallocate_frame(frame: Frame) { + unsafe { deallocate_p2frame(frame, 0) } +} + +// Helper function for quickly mapping device memory +pub unsafe fn map_device_memory(addr: PhysicalAddress, len: usize) -> VirtualAddress { + unsafe { + let mut mapper = KernelMapper::lock_rw(); + let base = PhysicalAddress::new(round_down_pages(addr.data())); + let aligned_len = round_up_pages(len + (addr.data() - base.data())); + for page_idx in 0..aligned_len / crate::memory::PAGE_SIZE { + let (_, flush) = mapper + .map_linearly( + base.add(page_idx * crate::memory::PAGE_SIZE), + PageFlags::new().write(true).device_memory(true), + ) + .expect("failed to linearly map device memory"); + flush.flush(); + } + RmmA::phys_to_virt(addr) + } +} + +const ORDER_COUNT: u32 = 11; +const MAX_ORDER: u32 = ORDER_COUNT - 1; + +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct Frame { + // On x86/x86_64, all memory below 1 MiB is reserved, and although some frames in that range + // may end up in the paging code, it's very unlikely that frame 0x0 would. + physaddr: NonZeroUsize, +} + +/// Option combined with power-of-two size. 
+#[derive(Clone, Copy)] +struct P2Frame(usize); +impl P2Frame { + fn new(frame: Option, order: u32) -> Self { + Self(frame.map_or(0, |f| f.physaddr.get()) | (order as usize)) + } + fn get(self) -> (Option, u32) { + let page_off_mask = PAGE_SIZE - 1; + ( + NonZeroUsize::new(self.0 & !page_off_mask & !RC_USED_NOT_FREE) + .map(|physaddr| Frame { physaddr }), + (self.0 & page_off_mask) as u32, + ) + } + fn frame(self) -> Option { + self.get().0 + } + fn order(self) -> u32 { + self.get().1 + } +} +impl core::fmt::Debug for P2Frame { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let (frame, order) = self.get(); + write!(f, "[frame at {frame:?}] order {order}") + } +} + +impl core::fmt::Debug for Frame { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "[frame at {:p}]", self.base().data() as *const u8) + } +} + +impl Frame { + /// Create a frame containing `address` + pub fn containing(address: PhysicalAddress) -> Frame { + Frame { + physaddr: NonZeroUsize::new(address.data() & !PAGE_MASK) + .expect("frame 0x0 is reserved"), + } + } + + /// Get the address of this frame + pub fn base(self) -> PhysicalAddress { + PhysicalAddress::new(self.physaddr.get()) + } + + #[track_caller] + pub fn next_by(self, n: usize) -> Self { + Self { + physaddr: self + .physaddr + .get() + .checked_add(n * PAGE_SIZE) + .and_then(NonZeroUsize::new) + .expect("overflow or null in Frame::next_by"), + } + } + pub fn offset_from(self, from: Self) -> usize { + self.physaddr + .get() + .checked_sub(from.physaddr.get()) + .expect("overflow in Frame::offset_from") + / PAGE_SIZE + } + pub fn is_aligned_to_order(self, order: u32) -> bool { + self.base().data().is_multiple_of(PAGE_SIZE << order) + } +} + +#[derive(Debug)] +pub struct Enomem; + +impl From for Error { + fn from(_: Enomem) -> Self { + Self::new(ENOMEM) + } +} + +#[derive(Debug)] +pub struct RaiiFrame { + inner: Frame, +} +impl RaiiFrame { + pub fn allocate() -> Result { + 
init_frame(RefCount::One) + .map_err(|_| Enomem) + .map(|inner| Self { inner }) + } + pub unsafe fn new_unchecked(inner: Frame) -> Self { + Self { inner } + } + pub fn get(&self) -> Frame { + self.inner + } + pub fn take(self) -> Frame { + let f = self.get(); + core::mem::forget(self); + f + } +} + +impl Drop for RaiiFrame { + fn drop(&mut self) { + if get_page_info(self.inner) + .expect("RaiiFrame lacking PageInfo") + .remove_ref() + .is_none() + { + unsafe { + deallocate_frame(self.inner); + } + } + } +} + +// TODO: Make PageInfo a union, since *every* allocated page will have an associated PageInfo. +// Pages that aren't AddrSpace data pages, such as paging-structure pages, might use the memory +// occupied by a PageInfo for something else, potentially allowing paging structure-level CoW too. +// +// TODO: Another interesting possibility would be to use a slab allocator for (ideally +// power-of-two) allocations smaller than a page, in which case this PageInfo might store a bitmap +// of used sub-allocations. +// +// TODO: Alternatively or in conjunction, the PageInfo can store the number of used entries for +// each page table, possibly even recursively (total number of mapped pages). +// NOTE: init_sections depends on the default initialized value consisting of all zero bytes. +#[derive(Debug)] +pub struct PageInfo { + /// The RC_USED_NOT_FREE bit marks whether the frame is considered "used" (allocated) or "free" + /// (available to allocator). It must crucially be correct at all times, as the allocator can + /// look at the bits for siblings when determining whether to merge free p2frames with it to + /// form larger p2frames. Changing RC_USED_NOT_FREE is only allowed if the freelist is locked, + /// so that the allocator does not simultaneously free a page causing this page to be a sibling + /// to be merged, with potentially unset prev/next. 
/// Borrowed view of a `PageInfo` whose frame is currently free, exposing its two words as
/// the links of the allocator's doubly linked free list.
///
/// Values of this type are produced by `PageInfo::as_free` and `PageInfo::make_free`.
struct PageInfoFree<'info> {
    // Borrow of `PageInfo::refcount`; while the frame is free this word holds the previous
    // list entry and the order, in `P2Frame` encoding.
    prev: &'info AtomicUsize,
    // Borrow of `PageInfo::next`; holds the next list entry and the order, in `P2Frame`
    // encoding.
    next: &'info AtomicUsize,
}
The refcount may +// increase or decrease with fetch_add, but must never flip this bit. +const RC_USED_NOT_FREE: usize = 1 << (usize::BITS - 1); + +// Only valid if RC_USED. Controls whether the page is CoW (map readonly, on page fault, copy and +// remap writable) or shared (mapped writable in the first place). +const RC_SHARED_NOT_COW: usize = 1 << (usize::BITS - 2); + +// The page refcount limit. This acts as a buffer zone allowing subsequent fetch_sub to correct +// overflow, which works as long as there's fewer CPUs than RC_MAX itself (and interrupts are +// disabled). +const RC_MAX: usize = 1 << (usize::BITS - 3); + +const RC_COUNT_MASK: usize = !(RC_USED_NOT_FREE | RC_SHARED_NOT_COW); + +// TODO: Use some of the flag bits as a tag, indicating the type of page (e.g. paging structure, +// userspace data page, or kernel heap page). This could be done only when debug assertions are +// enabled. +bitflags::bitflags! { + #[derive(Debug)] + pub struct FrameFlags: usize { + const NONE = 0; + } +} + +static mut ALLOCATOR_DATA: AllocatorData = AllocatorData { + sections: &[], + abs_off: 0, +}; + +struct AllocatorData { + // TODO: Memory hotplugging? 
+ sections: &'static [Section], + abs_off: usize, +} +#[derive(Debug)] +struct FreeList { + for_orders: [Option; ORDER_COUNT as usize], + used_frames: usize, +} +static FREELIST: Mutex = Mutex::new(FreeList { + for_orders: [None; ORDER_COUNT as usize], + used_frames: 0, +}); + +pub struct Section { + base: Frame, + frames: &'static [PageInfo], +} + +pub const MAX_SECTION_SIZE_BITS: u32 = 27; +pub const MAX_SECTION_SIZE: usize = 1 << MAX_SECTION_SIZE_BITS; +pub const MAX_SECTION_PAGE_COUNT: usize = MAX_SECTION_SIZE / PAGE_SIZE; + +const _: () = { + assert!(size_of::().is_power_of_two()); +}; + +#[cold] +fn init_sections(mut allocator: BumpAllocator) { + let (free_areas, offset_into_first_free_area) = allocator.free_areas(); + + let free_areas_iter = || { + free_areas.iter().copied().enumerate().map(|(i, area)| { + if i == 0 { + rmm::MemoryArea { + base: area.base.add(offset_into_first_free_area), + size: area.size - offset_into_first_free_area, + } + } else { + area + } + }) + }; + + let sections: &'static mut [Section] = { + let max_section_count: usize = free_areas_iter() + .map(|area| { + let aligned_end = area + .base + .add(area.size) + .data() + .next_multiple_of(MAX_SECTION_SIZE); + let aligned_start = area.base.data() / MAX_SECTION_SIZE * MAX_SECTION_SIZE; + + (aligned_end - aligned_start) / MAX_SECTION_SIZE + }) + .sum(); + let section_array_page_count = + (max_section_count * size_of::
()).div_ceil(PAGE_SIZE); + + let base = allocator + .allocate(FrameCount::new(section_array_page_count)) + .expect("failed to allocate sections array"); + unsafe { + core::slice::from_raw_parts_mut( + RmmA::phys_to_virt(base).data() as *mut Section, + max_section_count, + ) + } + }; + + let mut iter = free_areas_iter().peekable(); + + let mut i = 0; + + while let Some(mut memory_map_area) = iter.next() { + // TODO: NonZeroUsize + + // TODO: x86_32 fails without this check + if memory_map_area.size == 0 { + continue; + } + + assert_ne!( + memory_map_area.size, 0, + "RMM should enforce areas are not zeroed" + ); + + // TODO: Should RMM do this? + + while let Some(next_area) = iter.peek() + && next_area.base == memory_map_area.base.add(memory_map_area.size) + { + memory_map_area.size += next_area.size; + let _ = iter.next(); + } + + assert_eq!( + memory_map_area.base.data() % PAGE_SIZE, + 0, + "RMM should enforce area alignment" + ); + assert_eq!( + memory_map_area.size % PAGE_SIZE, + 0, + "RMM should enforce area length alignment" + ); + + let mut pages_left = memory_map_area.size.div_floor(PAGE_SIZE); + let mut base = Frame::containing(memory_map_area.base); + + while pages_left > 0 { + let page_info_max_count = core::cmp::min(pages_left, MAX_SECTION_PAGE_COUNT); + let pages_to_next_section = + (MAX_SECTION_SIZE - (base.base().data() % MAX_SECTION_SIZE)) / PAGE_SIZE; + let page_info_count = core::cmp::min(page_info_max_count, pages_to_next_section); + + let page_info_array_size_pages = + (page_info_count * size_of::()).div_ceil(PAGE_SIZE); + let page_info_array = unsafe { + let base = allocator + .allocate(FrameCount::new(page_info_array_size_pages)) + .expect("failed to allocate page info array"); + core::slice::from_raw_parts_mut( + RmmA::phys_to_virt(base).data() as *mut PageInfo, + page_info_count, + ) + }; + for p in &*page_info_array { + assert_eq!(p.next.load(Ordering::Relaxed), 0); + assert_eq!(p.refcount.load(Ordering::Relaxed), 0); + } + + sections[i] = 
Section { + base, + frames: page_info_array, + }; + i += 1; + + pages_left -= page_info_count; + base = base.next_by(page_info_count); + } + } + let sections = &mut sections[..i]; + + sections.sort_unstable_by_key(|s| s.base); + + // The bump allocator has been used during the section array and page info array allocation + // phases, which means some of the PageInfos will be pointing to those arrays themselves. + // Mark those pages as used! + 'sections: for section in &*sections { + for (off, page_info) in section.frames.iter().enumerate() { + let frame = section.base.next_by(off); + if frame.base() >= allocator.abs_offset() { + break 'sections; + } + //info!("MARKING {frame:?} AS USED"); + page_info + .refcount + .store(RC_USED_NOT_FREE, Ordering::Relaxed); + page_info.next.store(0, Ordering::Relaxed); + } + } + + let mut first_pages: [Option<(Frame, &'static PageInfo)>; ORDER_COUNT as usize] = + [None; ORDER_COUNT as usize]; + let mut last_pages = first_pages; + + let mut append_page = |page: Frame, info: &'static PageInfo, order| { + let this_page = (page, info); + + if page.base() < allocator.abs_offset() { + return; + } + debug_assert!(info.as_free().is_some()); + debug_assert!(this_page.0.is_aligned_to_order(order)); + debug_assert_eq!(info.next.load(Ordering::Relaxed), order as usize); + debug_assert_eq!(info.refcount.load(Ordering::Relaxed), 0); + + let last_page = last_pages[order as usize].replace(this_page); + + if let Some((last_frame, last_page_info)) = last_page { + let last_info = last_page_info.as_free().unwrap(); + + debug_assert_eq!(last_info.next().order(), order); + debug_assert_eq!(last_info.next().frame(), None); + + last_info.set_next(P2Frame::new(Some(page), order)); + info.as_free() + .unwrap() + .set_prev(P2Frame::new(Some(last_frame), order)); + } else { + first_pages[order as usize] = Some(this_page); + info.as_free().unwrap().set_prev(P2Frame::new(None, order)); + info.as_free().unwrap().set_next(P2Frame::new(None, order)); + } + }; + 
unsafe { + ALLOCATOR_DATA = AllocatorData { + sections, + abs_off: allocator.abs_offset().data(), + }; + } + + for section in &*sections { + let mut base = section.base; + let mut frames = section.frames; + + for order in 0..=MAX_ORDER { + let pages_for_current_order = 1 << order; + + debug_assert_eq!(frames.len() % pages_for_current_order, 0); + debug_assert!(base.is_aligned_to_order(order)); + + if !frames.is_empty() && order != MAX_ORDER && !base.is_aligned_to_order(order + 1) { + frames[0].next.store(order as usize, Ordering::Relaxed); + // The first section page is not aligned to the next order size. + + //info!("ORDER {order}: FIRST {base:?}"); + append_page(base, &frames[0], order); + + base = base.next_by(pages_for_current_order); + frames = &frames[pages_for_current_order..]; + } else { + //info!("ORDER {order}: FIRST SKIP"); + } + + if !frames.is_empty() + && order != MAX_ORDER + && !base.next_by(frames.len()).is_aligned_to_order(order + 1) + { + // The last section page is not aligned to the next order size. 
+ + let off = frames.len() - pages_for_current_order; + let final_page = base.next_by(off); + + frames[off].next.store(order as usize, Ordering::Relaxed); + + //info!("ORDER {order}: LAST {final_page:?}"); + append_page(final_page, &frames[off], order); + + frames = &frames[..off]; + } else { + //info!("ORDER {order}: LAST SKIP"); + } + + if frames.is_empty() { + break; + } + + if order == MAX_ORDER { + debug_assert_eq!(frames.len() % pages_for_current_order, 0); + debug_assert!(base.is_aligned_to_order(MAX_ORDER)); + + for (off, info) in frames.iter().enumerate().step_by(pages_for_current_order) { + info.next.store(MAX_ORDER as usize, Ordering::Relaxed); + append_page(base.next_by(off), info, MAX_ORDER); + } + } + } + + //info!("SECTION from {:?}, {} pages, array at {:p}", section.base, section.frames.len(), section.frames); + } + for (order, tuple_opt) in last_pages.iter().enumerate() { + let Some((frame, info)) = tuple_opt else { + continue; + }; + debug_assert!(frame.is_aligned_to_order(order as u32)); + let free = info.as_free().unwrap(); + debug_assert_eq!(free.prev().order(), order as u32); + free.set_next(P2Frame::new(None, order as u32)); + } + + FREELIST.lock().for_orders = first_pages.map(|pair| pair.map(|(frame, _)| frame)); + + //debug_freelist(); + debug!("Initial freelist consistent"); +} + +#[cold] +pub fn init_mm(allocator: BumpAllocator) { + init_sections(allocator); + + unsafe { + let the_frame = allocate_frame().expect("failed to allocate static zeroed frame"); + let the_info = get_page_info(the_frame).expect("static zeroed frame had no PageInfo"); + the_info + .refcount + .store(RefCount::One.to_raw(), Ordering::Relaxed); + + THE_ZEROED_FRAME.get().write(Some((the_frame, the_info))); + } +} +#[derive(Debug, PartialEq)] +pub enum AddRefError { + CowToShared, + SharedToCow, + RcOverflow, +} +impl PageInfo { + fn as_free(&self) -> Option> { + let this = &self; + let prev = this.refcount.load(Ordering::Relaxed); + + if prev & RC_USED_NOT_FREE == 
RC_USED_NOT_FREE { + None + } else { + Some(PageInfoFree { + prev: &this.refcount, + next: &this.next, + }) + } + } + pub fn add_ref(&self, kind: RefKind) -> Result<(), AddRefError> { + match (self.refcount().expect("cannot add_ref to free frame"), kind) { + (RefCount::One, RefKind::Cow) => { + self.refcount.store(RC_USED_NOT_FREE | 2, Ordering::Relaxed) + } + (RefCount::One, RefKind::Shared) => self + .refcount + .store(RC_USED_NOT_FREE | 2 | RC_SHARED_NOT_COW, Ordering::Relaxed), + (RefCount::Cow(_), RefKind::Cow) | (RefCount::Shared(_), RefKind::Shared) => { + let old = self.refcount.fetch_add(1, Ordering::Relaxed); + + // this is overflow-safe as long as the kernel can't be interrupted here, or the + // number of hw threads exceeds RC_COUNT_MAX + 1 - RC_MAX + if (old & RC_COUNT_MASK) >= RC_MAX { + self.refcount.fetch_sub(1, Ordering::Relaxed); + return Err(AddRefError::RcOverflow); + } + } + (RefCount::Cow(_), RefKind::Shared) => return Err(AddRefError::CowToShared), + (RefCount::Shared(_), RefKind::Cow) => return Err(AddRefError::SharedToCow), + } + Ok(()) + } + #[must_use = "must deallocate if refcount reaches None"] + pub fn remove_ref(&self) -> Option { + match self.refcount() { + None => panic!("refcount was already zero when calling remove_ref!"), + Some(RefCount::One) => { + // Used to be RC_USED_NOT_FREE | ?RC_SHARED_NOT_COW | 1, now becomes + // RC_USED_NOT_FREE + self.refcount.store(RC_USED_NOT_FREE, Ordering::Relaxed); + + None + } + Some(RefCount::Cow(_) | RefCount::Shared(_)) => RefCount::from_raw({ + // Used to be RC_USED_NOT_FREE | ?RC_SHARED_NOT_COW | n, now becomes + // RC_USED_NOT_FREE | ?RC_SHARED_NOT_COW | n - 1 + + // if the value returned from fetch_sub indicates count==1, the caller is + // responsible for freeing (return value will be None as mentioned above) + let new_value = self.refcount.fetch_sub(1, Ordering::Relaxed) - 1; + assert_ne!( + new_value, + RC_USED_NOT_FREE - 1, + "refcount underflow, allocator will break" + ); + 
assert_eq!( + new_value & RC_USED_NOT_FREE, + RC_USED_NOT_FREE, + "other malformed refcount" + ); + new_value + }), + } + } + #[track_caller] + pub fn allows_writable(&self) -> bool { + match self + .refcount() + .expect("using allows_writable on free page!") + { + RefCount::One => true, + RefCount::Cow(_) => false, + RefCount::Shared(_) => true, + } + } + + pub fn refcount(&self) -> Option { + let refcount = self.refcount.load(Ordering::Relaxed); + + RefCount::from_raw(refcount) + } + fn make_free(&self, order: u32) -> PageInfoFree<'_> { + // Order needs to be known so we don't for example merge A: [A] A A A B: [B] U U U into a + // 2^3 page (if U indicates "used"). + self.refcount.store(order as usize, Ordering::Relaxed); + self.next.store(order as usize, Ordering::Relaxed); + + PageInfoFree { + next: &self.next, + prev: &self.refcount, + } + } +} +impl PageInfoFree<'_> { + fn next(&self) -> P2Frame { + P2Frame(self.next.load(Ordering::Relaxed)) + } + #[track_caller] + fn set_next(&self, next: P2Frame) { + debug_assert!(next + .frame() + .is_none_or(|f| f.is_aligned_to_order(next.order()))); + self.next.store(next.0, Ordering::Relaxed) + } + fn prev(&self) -> P2Frame { + P2Frame(self.prev.load(Ordering::Relaxed)) + } + fn set_prev(&self, prev: P2Frame) { + debug_assert!(prev + .frame() + .is_none_or(|f| f.is_aligned_to_order(prev.order()))); + self.prev.store(prev.0, Ordering::Relaxed) + } + fn mark_used(&self) { + // Order is irrelevant if marked "used" + self.prev.store(RC_USED_NOT_FREE, Ordering::Relaxed); + self.next.store(0, Ordering::Relaxed); + } +} +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum RefKind { + Cow, + Shared, + // TODO: Observer? +} +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum RefCount { + One, + Shared(NonZeroUsize), + Cow(NonZeroUsize), +} +impl RefCount { + pub fn from_raw(raw: usize) -> Option { + if raw & RC_USED_NOT_FREE != RC_USED_NOT_FREE { + // Refcount not meaningful for free pages. 
+ return None; + } + let refcount = raw & !(RC_SHARED_NOT_COW | RC_USED_NOT_FREE); + + // Refcount being zero means some other hw thread decreased it from one; hence it would be + // very invalid if this caller too, logically owned a reference to it. Reaching zero can for + // example occur if two threads simultaneously get Cow(2), then both fetch_sub + // simultaneously (meaning both are trying to free something). One of them will then notice + // it was the one decreasing from One, hence being responsible to free it. + let nz_refcount = NonZeroUsize::new(refcount)?; + + Some(if nz_refcount.get() == 1 { + RefCount::One + } else if raw & RC_SHARED_NOT_COW == RC_SHARED_NOT_COW { + RefCount::Shared(nz_refcount) + } else { + RefCount::Cow(nz_refcount) + }) + } + pub fn to_raw(self) -> usize { + match self { + Self::One => 1 | RC_USED_NOT_FREE, + Self::Shared(inner) => inner.get() | RC_SHARED_NOT_COW | RC_USED_NOT_FREE, + Self::Cow(inner) => inner.get() | RC_USED_NOT_FREE, + } + } +} +#[inline] +fn sections() -> &'static [Section] { + unsafe { ALLOCATOR_DATA.sections } +} +pub fn get_page_info(frame: Frame) -> Option<&'static PageInfo> { + let sections = sections(); + + let idx_res = sections.binary_search_by_key(&frame, |section| section.base); + + if idx_res == Err(0) { + // The frame is before the first section + return None; + } + + // binary_search_by_key returns either Ok(where it was found) or Err(where it would have been + // inserted). The base obviously cannot have been exactly matched from an entry at an + // out-of-bounds index, so the only Err(i) where i - 1 is out of bounds, is for i=0. That + // has already been checked. 
+ let section = §ions[idx_res.unwrap_or_else(|e| e - 1)]; + + section.frames.get(frame.offset_from(section.base)) +} + +#[track_caller] +fn get_free_alloc_page_info(frame: Frame) -> PageInfoFree<'static> { + let i = get_page_info(frame).unwrap_or_else(|| { + panic!("allocator-owned frames need a PageInfo, but none for {frame:?}") + }); + i.as_free() + .unwrap_or_else(|| panic!("expected frame to be free, but {frame:?} wasn't, in {i:?}")) +} + +pub struct Segv; + +bitflags! { + /// Arch-generic page fault flags, modeled after x86's error code. + /// + /// This may change when arch-specific features are utilized better. + pub struct GenericPfFlags: u32 { + const PRESENT = 1 << 0; + const INVOLVED_WRITE = 1 << 1; + const USER_NOT_SUPERVISOR = 1 << 2; + const INSTR_NOT_DATA = 1 << 3; + // "reserved bits" on x86 + const INVL = 1 << 31; + } +} + +pub trait ArchIntCtx { + fn ip(&self) -> usize; + fn recover_and_efault(&mut self); +} + +pub fn page_fault_handler( + stack: &mut impl ArchIntCtx, + code: GenericPfFlags, + faulting_address: VirtualAddress, +) -> Result<(), Segv> { + let faulting_page = Page::containing_address(faulting_address); + + let usercopy_region = __usercopy_start()..__usercopy_end(); + + // TODO: Most likely not necessary, but maybe also check that the faulting address is not too + // close to USER_END. 
+ let address_is_user = faulting_address.kind() == TableKind::User; + + let invalid_page_tables = code.contains(GenericPfFlags::INVL); + let caused_by_user = code.contains(GenericPfFlags::USER_NOT_SUPERVISOR); + let caused_by_kernel = !caused_by_user; + let caused_by_write = code.contains(GenericPfFlags::INVOLVED_WRITE); + let caused_by_instr_fetch = code.contains(GenericPfFlags::INSTR_NOT_DATA); + let is_usercopy = usercopy_region.contains(&stack.ip()); + + let mode = match (caused_by_write, caused_by_instr_fetch) { + (true, false) => AccessMode::Write, + (false, false) => AccessMode::Read, + (false, true) => AccessMode::InstrFetch, + (true, true) => { + unreachable!("page fault cannot be caused by both instruction fetch and write") + } + }; + + if invalid_page_tables { + // TODO: Better error code than Segv? + return Err(Segv); + } + + if address_is_user && (caused_by_user || is_usercopy) { + let mut token = unsafe { CleanLockToken::new() }; + match context::memory::try_correcting_page_tables(faulting_page, mode, &mut token) { + Ok(()) => return Ok(()), + Err(PfError::Oom) => todo!("oom"), + Err(PfError::Segv | PfError::RecursionLimitExceeded) => (), + Err(PfError::NonfatalInternalError) => todo!(), + } + } + + if address_is_user && caused_by_kernel && mode != AccessMode::InstrFetch && is_usercopy { + stack.recover_and_efault(); + return Ok(()); + } + + Err(Segv) +} +static THE_ZEROED_FRAME: SyncUnsafeCell> = + SyncUnsafeCell::new(None); + +pub fn the_zeroed_frame() -> (Frame, &'static PageInfo) { + unsafe { + THE_ZEROED_FRAME + .get() + .read() + .expect("zeroed frame must be initialized") + } +} + +pub fn init_frame(init_rc: RefCount) -> Result { + let new_frame = allocate_frame().ok_or(PfError::Oom)?; + let page_info = get_page_info(new_frame).unwrap_or_else(|| { + panic!( + "all allocated frames need an associated page info, {:?} didn't", + new_frame + ) + }); + debug_assert_eq!(page_info.refcount(), Some(RefCount::One)); + page_info + .refcount + 
.store(init_rc.to_raw(), Ordering::Relaxed); + + Ok(new_frame) +} +#[derive(Debug)] +pub struct TheFrameAllocator; + +unsafe impl FrameAllocator for TheFrameAllocator { + fn allocate(&mut self, count: FrameCount) -> Option { + let order = count.data().next_power_of_two().trailing_zeros(); + allocate_p2frame(order).map(|f| f.base()) + } + unsafe fn free(&mut self, address: PhysicalAddress, count: FrameCount) { + unsafe { + let order = count.data().next_power_of_two().trailing_zeros(); + deallocate_p2frame(Frame::containing(address), order) + } + } + fn usage(&self) -> FrameUsage { + FrameUsage::new( + FrameCount::new(used_frames()), + FrameCount::new(total_frames()), + ) + } +} diff --git a/src/memory/page.rs b/src/memory/page.rs new file mode 100644 index 0000000000..3d5c2a21e3 --- /dev/null +++ b/src/memory/page.rs @@ -0,0 +1,85 @@ +// Some code was borrowed from [Phil Opp's Blog](http://os.phil-opp.com/modifying-page-tables.html) + +use core::fmt::Debug; +use rmm::{Arch, VirtualAddress}; + +use crate::memory::RmmA; + +/// Size of pages +pub const PAGE_SIZE: usize = RmmA::PAGE_SIZE; +pub const PAGE_MASK: usize = RmmA::PAGE_OFFSET_MASK; + +/// Page +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Page { + number: usize, +} + +impl Page { + pub fn start_address(self) -> VirtualAddress { + VirtualAddress::new(self.number * PAGE_SIZE) + } + + pub fn containing_address(address: VirtualAddress) -> Page { + //TODO assert!(address.data() < 0x0000_8000_0000_0000 || address.data() >= 0xffff_8000_0000_0000, + // "invalid address: 0x{:x}", address.data()); + Page { + number: address.data() / PAGE_SIZE, + } + } + + pub fn range_inclusive(start: Page, r#final: Page) -> PageIter { + PageIter { + start, + end: r#final.next(), + } + } + pub fn next(self) -> Page { + self.next_by(1) + } + pub fn next_by(self, n: usize) -> Page { + Self { + number: self.number + n, + } + } + pub fn offset_from(self, other: Self) -> usize { + self.number - other.number + } +} +impl 
Debug for Page { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "[page at {:p}]", + self.start_address().data() as *const u8 + ) + } +} + +pub struct PageIter { + start: Page, + end: Page, +} + +impl Iterator for PageIter { + type Item = Page; + + fn next(&mut self) -> Option { + if self.start < self.end { + let page = self.start; + self.start = self.start.next(); + Some(page) + } else { + None + } + } +} + +/// Round down to the nearest multiple of page size +pub fn round_down_pages(number: usize) -> usize { + number.div_floor(PAGE_SIZE) * PAGE_SIZE +} +/// Round up to the nearest multiple of page size +pub fn round_up_pages(number: usize) -> usize { + number.next_multiple_of(PAGE_SIZE) +} diff --git a/src/panic.rs b/src/panic.rs new file mode 100644 index 0000000000..50c05dcaa6 --- /dev/null +++ b/src/panic.rs @@ -0,0 +1,202 @@ +//! Intrinsics for panic handling + +use core::{panic::PanicInfo, slice}; + +#[cfg(target_pointer_width = "32")] +use object::elf::FileHeader32 as FileHeader; +#[cfg(target_pointer_width = "64")] +use object::elf::FileHeader64 as FileHeader; +use object::{ + elf, + read::elf::{FileHeader as _, Sym as _}, + NativeEndian, +}; +use rmm::VirtualAddress; +use rustc_demangle::demangle; + +use crate::{ + arch::{ + consts::USER_END_OFFSET, + interrupt::{self, trace::StackTrace, InterruptStack}, + }, + context, cpu_id, + memory::KernelMapper, + sync::CleanLockToken, + syscall::{self, usercopy::UserSliceRo}, +}; + +/// Required to handle panics +#[cfg(not(test))] +#[panic_handler] +fn panic_handler(info: &PanicInfo) -> ! { + panic_handler_inner(info) +} + +#[cfg_attr(test, expect(dead_code))] +fn panic_handler_inner(info: &PanicInfo) -> ! 
{ + println!("KERNEL PANIC: {}", info); + + unsafe { + stack_trace(); + } + + let Some(context_lock) = context::try_current() else { + println!("CPU {}, CID ", cpu_id()); + + println!("HALT"); + loop { + unsafe { + interrupt::halt(); + } + } + }; + + println!("CPU {}, CID {:p}", cpu_id(), context_lock); + + { + // This could deadlock, but at this point we are going to halt anyways + let mut token = unsafe { CleanLockToken::new() }; + let context = context_lock.read(token.token()); + println!("NAME: {}, DEBUG ID: {}", context.name, context.debug_id); + + if let Some([a, b, c, d, e, f, g]) = context.current_syscall() { + println!( + "SYSCALL: {}", + syscall::debug::format_call(a, b, c, d, e, f, g) + ); + } + } + + println!("HALT"); + loop { + unsafe { + interrupt::halt(); + } + } +} + +/// Get a stack trace +#[inline(never)] +pub unsafe fn stack_trace() { + unsafe { + let mapper = KernelMapper::lock_ro(); + + let kernel_ptr = crate::kernel_executable_offsets::KERNEL_OFFSET() as *const u8; + let elf_header: &FileHeader = object::pod::from_bytes(slice::from_raw_parts( + kernel_ptr, + size_of::>(), + )) + .unwrap() + .0; + + // This assumes that the linker places .shstrtab as last section. If it + // isn't, that just causes a recursive panic, not UB. 
+ let kernel_size = elf_header.e_shoff(NativeEndian) as usize + + usize::from(elf_header.e_shnum(NativeEndian)) + * usize::from(elf_header.e_shentsize(NativeEndian)); + let kernel_slice = slice::from_raw_parts(kernel_ptr, kernel_size); + + let symbols = elf_header + .sections(NativeEndian, kernel_slice) + .unwrap() + .symbols(NativeEndian, kernel_slice, elf::SHT_SYMTAB) + .unwrap(); + + let mut frame = StackTrace::start(); + + //Maximum 64 frames + for _ in 0..64 { + let Some(frame_) = frame else { + break; + }; + let fp_virt = VirtualAddress::new(frame_.fp); + let pc_virt = VirtualAddress::new(frame_.pc_ptr as usize); + if !(fp_virt.data() >= USER_END_OFFSET + && pc_virt.data() >= USER_END_OFFSET + && (fp_virt.data() as *const usize).is_aligned() + && (pc_virt.data() as *const usize).is_aligned() + && mapper.translate(fp_virt).is_some() + && mapper.translate(pc_virt).is_some()) + { + println!(" {:>016x}: GUARD PAGE", frame_.fp); + break; + } + + let pc = *frame_.pc_ptr; + if pc == 0 { + println!(" {:>016x}: EMPTY RETURN", frame_.fp); + break; + } + + println!(" FP {:>016x}: PC {:>016x}", frame_.fp, pc); + + for sym in symbols.iter() { + if sym.st_type() != elf::STT_FUNC { + continue; + } + let sym_addr = sym.st_value.get(NativeEndian) as usize; + if !(pc >= sym_addr && pc < sym_addr + sym.st_size.get(NativeEndian) as usize) { + continue; + } + + println!(" {:>016X}+{:>04X}", sym_addr, pc - sym_addr); + + if let Some(sym_name) = sym + .name(NativeEndian, symbols.strings()) + .ok() + .and_then(|name| core::str::from_utf8(name).ok()) + { + println!(" {:#}", demangle(sym_name)); + } + } + frame = frame_.next(); + } + } +} + +/// Get a user stack trace +#[inline(never)] +pub unsafe fn user_stack_trace(stack: &InterruptStack) { + let mut fp = stack.frame_pointer(); + let sp = stack.stack_pointer(); + + if fp < sp { + println!(" "); + return; + } + if fp >= crate::USER_END_OFFSET { + return; + } + + for _ in 0..64 { + if fp == 0 || fp >= crate::USER_END_OFFSET { + break; 
+ } + let rip_addr = fp + size_of::(); + let rip = match UserSliceRo::new(rip_addr, size_of::()).and_then(|x| x.read_usize()) + { + Ok(val) => val, + Err(err) => { + println!(" 016x}>: {}", fp, err); + break; + } + }; + println!(" FP {:>016x}: PC {:>016x}", fp, rip); + if rip == 0 { + break; + } + + let next_fp = match UserSliceRo::new(fp, size_of::()).and_then(|x| x.read_usize()) { + Ok(val) => val, + Err(_err) => break, + }; + if next_fp <= fp { + println!( + " 016x}; stack walk ended>", + next_fp + ); + break; + } + fp = next_fp; + } +} diff --git a/src/percpu.rs b/src/percpu.rs new file mode 100644 index 0000000000..f4ad5e66e6 --- /dev/null +++ b/src/percpu.rs @@ -0,0 +1,205 @@ +use alloc::{ + sync::{Arc, Weak}, + vec::Vec, +}; +use core::{ + cell::{Cell, RefCell}, + sync::atomic::{AtomicBool, AtomicPtr, Ordering}, +}; + +use rmm::Arch; +use syscall::PtraceFlags; + +use crate::{ + arch::device::ArchPercpuMisc, + context::{empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu}, + cpu_set::{LogicalCpuId, MAX_CPU_COUNT}, + cpu_stats::{CpuStats, CpuStatsData}, + ptrace::Session, + sync::CleanLockToken, + syscall::debug::SyscallDebugInfo, +}; + +/// The percpu block, that stored all percpu variables. +pub struct PercpuBlock { + /// A unique immutable number that identifies the current CPU - used for scheduling + pub cpu_id: LogicalCpuId, + + /// Context management + pub switch_internals: ContextSwitchPercpu, + + pub current_addrsp: RefCell>>, + pub new_addrsp_tmp: Cell>>, + pub wants_tlb_shootdown: AtomicBool, + pub balance: Cell<[usize; 40]>, + pub last_queue: Cell, + + // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it + // first to avoid cache invalidation. 
+ pub profiling: Option<&'static crate::profiling::RingBuffer>, + + pub ptrace_flags: Cell, + pub ptrace_session: RefCell>>, + pub inside_syscall: Cell, + + pub syscall_debug_info: Cell, + + pub misc_arch_info: crate::arch::device::ArchPercpuMisc, + + pub stats: CpuStats, +} + +static ALL_PERCPU_BLOCKS: [AtomicPtr; MAX_CPU_COUNT as usize] = + [const { AtomicPtr::new(core::ptr::null_mut()) }; MAX_CPU_COUNT as usize]; + +#[allow(unused)] +pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) { + ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release) +} + +pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> { + let mut res = ALL_PERCPU_BLOCKS + .iter() + .filter_map(|block| unsafe { block.load(Ordering::Relaxed).as_ref() }) + .map(|block| { + let stats = &block.stats; + (block.cpu_id, stats.into()) + }) + .collect::>(); + res.sort_unstable_by_key(|(id, _stats)| id.get()); + res +} + +// PercpuBlock::current() is implemented somewhere in the arch-specific modules + +pub fn shootdown_tlb_ipi(target: Option) { + if cfg!(not(feature = "multi_core")) { + return; + } + + if let Some(target) = target { + let my_percpublock = PercpuBlock::current(); + assert_ne!(target, my_percpublock.cpu_id); + + let Some(percpublock) = (unsafe { + ALL_PERCPU_BLOCKS[target.get() as usize] + .load(Ordering::Acquire) + .as_ref() + }) else { + warn!("Trying to TLB shootdown a CPU that doesn't exist or isn't initialized."); + return; + }; + #[expect(clippy::bool_comparison)] + while percpublock + .wants_tlb_shootdown + .swap(true, Ordering::Release) + == true + { + // Load is faster than CAS or on x86, LOCK BTS + while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { + my_percpublock.maybe_handle_tlb_shootdown(); + core::hint::spin_loop(); + } + } + + crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock); + } else { + for id in 0..crate::cpu_count() { + // TODO: Optimize: use global counter and percpu ack counters, send IPI 
using + // destination shorthand "all CPUs". + shootdown_tlb_ipi(Some(LogicalCpuId::new(id))); + } + } +} +impl PercpuBlock { + pub fn maybe_handle_tlb_shootdown(&self) { + #[expect(clippy::bool_comparison)] + if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false { + return; + } + + // TODO: Finer-grained flush + crate::memory::RmmA::invalidate_all(); + + if let Some(addrsp) = &*self.current_addrsp.borrow() { + addrsp.tlb_ack.fetch_add(1, Ordering::Release); + } + } +} +pub unsafe fn switch_arch_hook() { + unsafe { + let percpu = PercpuBlock::current(); + + let cur_addrsp = percpu.current_addrsp.borrow(); + let next_addrsp = percpu.new_addrsp_tmp.take(); + + let retain_pgtbl = match (&*cur_addrsp, &next_addrsp) { + (Some(p), Some(n)) => Arc::ptr_eq(p, n), + (Some(_), None) | (None, Some(_)) => false, + (None, None) => true, + }; + if retain_pgtbl { + // If we are not switching to a different address space, we can simply return early. + return; + } + if let Some(prev_addrsp) = &*cur_addrsp { + prev_addrsp.used_by.atomic_clear(percpu.cpu_id); + + // See [`Flusher::flush`]. + // + // Without the fence, `wants_tlb_shootdown` check *may* happen + // before the CPU is removed from the `used_by` set. Hence, if a + // shootdown request arises *after* the check and *before* removing + // the CPU from the set, it would be missed and the CPU who + // requested the shootdown would spin forever since the request was + // never ACKed. + core::sync::atomic::fence(Ordering::SeqCst); + + percpu.maybe_handle_tlb_shootdown(); + } + + drop(cur_addrsp); + + // Tell future TLB shootdown handlers that old_addrsp_tmp is no longer the current address + // space. 
+ *percpu.current_addrsp.borrow_mut() = next_addrsp; + + match &*percpu.current_addrsp.borrow() { + Some(next_addrsp) => { + next_addrsp.used_by.atomic_set(percpu.cpu_id); + let mut token = CleanLockToken::new(); + let mut token = token.token(); + let next = next_addrsp.acquire_read(token.downgrade()); + + next.table.utable.make_current(); + } + _ => { + crate::memory::RmmA::set_table(rmm::TableKind::User, empty_cr3()); + } + } + } +} +impl PercpuBlock { + pub const fn init(cpu_id: LogicalCpuId) -> Self { + Self { + cpu_id, + switch_internals: ContextSwitchPercpu::default(), + current_addrsp: RefCell::new(None), + new_addrsp_tmp: Cell::new(None), + wants_tlb_shootdown: AtomicBool::new(false), + balance: Cell::new([0; 40]), + last_queue: Cell::new(39), + ptrace_flags: Cell::new(PtraceFlags::empty()), + ptrace_session: RefCell::new(None), + inside_syscall: Cell::new(false), + + syscall_debug_info: Cell::new(SyscallDebugInfo::default()), + + profiling: None, + + misc_arch_info: ArchPercpuMisc::default(), + + stats: CpuStats::default(), + } + } +} diff --git a/src/profiling.rs b/src/profiling.rs new file mode 100644 index 0000000000..79ed7fe085 --- /dev/null +++ b/src/profiling.rs @@ -0,0 +1,330 @@ +use alloc::{boxed::Box, vec::Vec}; +use core::{ + cell::{SyncUnsafeCell, UnsafeCell}, + mem::size_of, + sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicUsize, Ordering}, +}; +#[cfg(target_arch = "x86_64")] +use rmm::Arch; + +#[cfg(feature = "profiling")] +use crate::arch::{idt::Idt, interrupt::irq::aux_timer}; +#[cfg(target_arch = "x86_64")] +use crate::arch::{ + interrupt::{self, InterruptStack}, + CurrentRmmArch, +}; +use crate::{ + cpu_set::LogicalCpuId, + percpu::PercpuBlock, + syscall::{error::*, usercopy::UserSliceWo}, +}; + +#[cfg(all(feature = "profiling", not(target_arch = "x86_64")))] +compile_error!("Profiling not supported outside x86_64"); + +const N: usize = 16 * 1024 * 1024; + +pub struct RingBuffer { + head: AtomicUsize, + tail: AtomicUsize, + buf: 
&'static [UnsafeCell; N], + pub(crate) nmi_kcount: AtomicUsize, + pub(crate) nmi_ucount: AtomicUsize, +} + +impl RingBuffer { + unsafe fn advance_head(&self, n: usize) { + self.head.store( + self.head.load(Ordering::Acquire).wrapping_add(n), + Ordering::Release, + ); + } + unsafe fn advance_tail(&self, n: usize) { + self.tail.store( + self.tail.load(Ordering::Acquire).wrapping_add(n), + Ordering::Release, + ); + } + unsafe fn sender_owned(&self) -> [&[UnsafeCell]; 2] { + let head = self.head.load(Ordering::Acquire) % N; + let tail = self.tail.load(Ordering::Acquire) % N; + + if head <= tail { // NOTE(review): head == tail is treated as empty, so a completely full buffer looks empty to the receiver + [&self.buf[tail..], &self.buf[..head]] + } else { + [&self.buf[tail..head], &[]] + } + } + unsafe fn receiver_owned(&self) -> [&[UnsafeCell]; 2] { + let head = self.head.load(Ordering::Acquire) % N; + let tail = self.tail.load(Ordering::Acquire) % N; + + if head > tail { + [&self.buf[head..], &self.buf[..tail]] + } else { + [&self.buf[head..tail], &[]] + } + } + pub unsafe fn extend(&self, mut slice: &[usize]) -> usize { + let mut n = 0; + for mut sender_slice in unsafe { self.sender_owned() } { + while !slice.is_empty() && !sender_slice.is_empty() { + unsafe { sender_slice[0].get().write(slice[0]) }; + slice = &slice[1..]; + sender_slice = &sender_slice[1..]; + n += 1; + } + } + unsafe { self.advance_tail(n) }; + n + } + pub unsafe fn peek(&self) -> [&[usize]; 2] { + unsafe { + self.receiver_owned() + .map(|slice| core::slice::from_raw_parts(slice.as_ptr().cast(), slice.len())) + } + } + pub unsafe fn advance(&self, n: usize) { + unsafe { self.advance_head(n) } + } + pub fn create() -> &'static Self { + Box::leak(Box::new(Self { + head: AtomicUsize::new(0), + tail: AtomicUsize::new(0), + buf: Box::leak(unsafe { Box::new_zeroed().assume_init() }), + nmi_kcount: AtomicUsize::new(0), + nmi_ucount: AtomicUsize::new(0), + })) + } +} + +// SAFETY: must only be written by BSP, then constant +// TODO: probably insignificant, but maybe perf can be improved by replacing AtomicPtr
with +// SyncUnsafeCell? +static BUFS_RAW: SyncUnsafeCell<&'static [AtomicPtr]> = SyncUnsafeCell::new(&[]); + +pub fn bufs() -> &'static [AtomicPtr] { + unsafe { *BUFS_RAW.get() } +} + +pub const PROFILE_TOGGLEABLE: bool = true; +pub static IS_PROFILING: AtomicBool = AtomicBool::new(false); + +pub fn serio_command(index: usize, data: u8) { + if cfg!(not(feature = "profiling")) { + return; + } + + if PROFILE_TOGGLEABLE { + if index == 0 && data == 30 { + // "a" key in QEMU + info!("Enabling profiling"); + IS_PROFILING.store(true, Ordering::SeqCst); + } else if index == 0 && data == 48 { + // "b" key + info!("Disabling profiling"); + IS_PROFILING.store(false, Ordering::SeqCst); + } + } +} + +#[cfg_attr(not(feature = "profiling"), expect(dead_code))] +pub fn drain_buffer(cpu_num: LogicalCpuId, buf: UserSliceWo) -> Result { + unsafe { + let Some(src) = bufs() + .get(cpu_num.get() as usize) + .ok_or(Error::new(EBADFD))? + .load(Ordering::Acquire) // Acquire: the pointer is dereferenced below, so pair with the store that publishes it + .as_ref() + else { + return Ok(0); + }; + let byte_slices = src.peek().map(|words| { + core::slice::from_raw_parts(words.as_ptr().cast::(), size_of_val(words)) + }); + + let copied_1 = buf.copy_common_bytes_from_slice(byte_slices[0])?; + src.advance(copied_1 / size_of::()); + + let copied_2 = if let Some(remaining) = buf.advance(copied_1) { + remaining.copy_common_bytes_from_slice(byte_slices[1])? + } else { + 0 + }; + src.advance(copied_2 / size_of::()); + + Ok(copied_1 + copied_2) + } +} + +#[cfg(target_arch = "x86_64")] +pub unsafe fn nmi_handler(stack: &InterruptStack) { + if cfg!(not(feature = "profiling")) { + return; + } + + let Some(profiling) = crate::percpu::PercpuBlock::current().profiling else { + return; + }; + if !IS_PROFILING.load(Ordering::Relaxed) { + return; + } + if stack.iret.cs & 0b11 == 0b11 { // low two bits of the saved CS are the privilege level; 3 means user mode (the old mask 0b00 never matched) + profiling.nmi_ucount.fetch_add(1, Ordering::Relaxed); + return; + } else if stack.iret.rflags & (1 << 9) != 0 { + // Interrupts were enabled, i.e. we were in kmain, so ignore.
+ return; + } else { + profiling.nmi_kcount.fetch_add(1, Ordering::Relaxed); + }; + + let mut buf = [0_usize; 32]; + buf[0] = stack.iret.rip & !(1 << 63); + buf[1] = unsafe { x86::time::rdtsc() } as usize; + + let mut bp = stack.preserved.rbp; + + let mut len = 2; + + for i in 2..32 { + if bp < CurrentRmmArch::PHYS_OFFSET + || bp.saturating_add(16) >= CurrentRmmArch::PHYS_OFFSET + crate::PML4_SIZE + { + break; + } + let ip = unsafe { ((bp + 8) as *const usize).read() }; + bp = unsafe { (bp as *const usize).read() }; + + if ip < crate::kernel_executable_offsets::__text_start() + || ip >= crate::kernel_executable_offsets::__text_end() + { + break; + } + buf[i] = ip; + + len = i + 1; + } + + let _ = unsafe { profiling.extend(&buf[..len]) }; +} + +static NUM_ORDINARY_CPUS: AtomicU32 = AtomicU32::new(u32::MAX); + +#[cfg(feature = "profiling")] +pub fn cpu_exists(cpu: LogicalCpuId) -> bool { + cpu.get() < NUM_ORDINARY_CPUS.load(Ordering::Relaxed) +} + +fn profiler_cpu() -> LogicalCpuId { + #[cfg(feature = "profiling")] + return LogicalCpuId::new(NUM_ORDINARY_CPUS.load(Ordering::SeqCst)); + + #[cfg(not(feature = "profiling"))] + return LogicalCpuId::new(u32::MAX); +} + +// SAFETY: must be called before any init() +pub unsafe fn allocate(total_cpu_count: u32) { + if cfg!(not(feature = "profiling")) { + return; + } + + info!("Preliminary number of CPUs: {total_cpu_count}"); + + let ordinary_cpu_count = total_cpu_count.checked_sub(1).unwrap(); + NUM_ORDINARY_CPUS.store(ordinary_cpu_count, Ordering::SeqCst); + + let slice = Box::leak( + ((0..ordinary_cpu_count as usize) + .map(|_| AtomicPtr::new(core::ptr::null_mut())) + .collect::>()) + .into_boxed_slice(), + ); + unsafe { + BUFS_RAW.get().write(slice); + } +} + +// SAFETY: must be called after allocate() or data races may occur +pub unsafe fn init() { + if cfg!(not(feature = "profiling")) { + return; + } + + let percpu = PercpuBlock::current(); + + if percpu.cpu_id == profiler_cpu() { + return; + } + + let profiling = 
RingBuffer::create(); + + bufs()[percpu.cpu_id.get() as usize].store( + (profiling as *const RingBuffer).cast_mut(), + core::sync::atomic::Ordering::SeqCst, + ); + unsafe { + (core::ptr::addr_of!(percpu.profiling) as *mut Option<&'static RingBuffer>) + .write(Some(profiling)); + } +} + +static ACK: AtomicU32 = AtomicU32::new(0); + +pub fn ready_for_profiling() { + if cfg!(not(feature = "profiling")) { + return; + } + + ACK.fetch_add(1, Ordering::Relaxed); +} + +pub fn maybe_run_profiling_helper_forever(cpu_id: LogicalCpuId) { + if cfg!(not(feature = "profiling")) { + return; + } + + if cpu_id != profiler_cpu() { + return; + } + #[cfg(target_arch = "x86_64")] + unsafe { + for i in 33..255 { + crate::arch::idt::IDTS + .write() + .get_mut(&cpu_id) + .unwrap() + .entries[i] + .set_func(crate::arch::interrupt::ipi::wakeup); + } + + let apic = &mut crate::arch::device::local_apic::the_local_apic(); + apic.set_lvt_timer((0b01 << 17) | 32); + apic.set_div_conf(0b1011); + apic.set_init_count(0xffff_f); + + while ACK.load(Ordering::Relaxed) < NUM_ORDINARY_CPUS.load(Ordering::SeqCst) { + core::hint::spin_loop(); + } + + interrupt::enable_and_nop(); + loop { + interrupt::halt(); + } + } +} + +#[cfg(feature = "profiling")] +pub fn maybe_setup_timer(idt: &mut Idt, cpu_id: LogicalCpuId) { + if cfg!(not(feature = "profiling")) { + return; + } + + if cpu_id != profiler_cpu() { + return; + } + idt.entries[32].set_func(aux_timer); + idt.set_reserved_mut(32, true); +} diff --git a/src/ptrace.rs b/src/ptrace.rs new file mode 100644 index 0000000000..6cf59407e3 --- /dev/null +++ b/src/ptrace.rs @@ -0,0 +1,248 @@ +//! The backend of the "proc:" scheme. Most internal breakpoint +//! handling should go here, unless they closely depend on the design +//! of the scheme. 
+ +use crate::{ + event, + percpu::PercpuBlock, + scheme::SchemeExt, + sync::{CleanLockToken, WaitCondition}, + syscall::{data::PtraceEvent, error::*, flag::*, ptrace_event}, +}; + +use alloc::{collections::VecDeque, sync::Arc}; +use core::cmp; +use spin::Mutex; +use syscall::data::GlobalSchemes; + +// ____ _ +// / ___| ___ ___ ___(_) ___ _ __ ___ +// \___ \ / _ \/ __/ __| |/ _ \| '_ \/ __| +// ___) | __/\__ \__ \ | (_) | | | \__ \ +// |____/ \___||___/___/_|\___/|_| |_|___/ + +#[derive(Debug)] +pub struct SessionData { + pub(crate) breakpoint: Option, + events: VecDeque, + file_id: usize, +} +impl SessionData { + fn add_event(&mut self, event: PtraceEvent, token: &mut CleanLockToken) { + self.events.push_back(event); + + // Notify nonblocking tracers + if self.events.len() == 1 { + // If the list of events was previously empty, alert now + proc_trigger_event(self.file_id, EVENT_READ, token); + } + } + + /// Override the breakpoint for the specified tracee. Pass `None` to clear + /// breakpoint. + pub fn set_breakpoint(&mut self, flags: Option) { + self.breakpoint = flags.map(|flags| Breakpoint { + reached: false, + flags, + }); + } + + /// Returns true only if a breakpoint is set and has been reached; + /// returns false when there isn't a breakpoint + pub fn is_reached(&self) -> bool { + self.breakpoint.as_ref().map(|b| b.reached).unwrap_or(false) + } + + /// Used for getting the flags in fevent + pub fn session_fevent_flags(&self) -> EventFlags { + let mut flags = EventFlags::empty(); + + if !self.events.is_empty() { + flags |= EVENT_READ; + } + + flags + } + + /// Poll events, return the amount read. This drains events from the queue.
+ pub fn recv_events(&mut self, out: &mut [PtraceEvent]) -> usize { + let len = cmp::min(out.len(), self.events.len()); + for (dst, src) in out.iter_mut().zip(self.events.drain(..len)) { + *dst = src; + } + len + } +} + +#[derive(Debug)] +pub struct Session { + pub data: Mutex, + pub tracee: WaitCondition, + pub tracer: WaitCondition, +} +impl Session { + pub fn current() -> Option> { + PercpuBlock::current() + .ptrace_session + .borrow() + .as_ref()? + .upgrade() + } + pub fn new(file_id: usize) -> Arc { + Arc::new(Session { + data: Mutex::new(SessionData { + breakpoint: None, + events: VecDeque::new(), + file_id, + }), + tracee: WaitCondition::new(), + tracer: WaitCondition::new(), + }) + } +} + +/// Remove the session from the list of open sessions and notify any +/// waiting processes +// TODO +pub fn close_session(session: &Session, token: &mut CleanLockToken) { + session.tracer.notify(token); + session.tracee.notify(token); +} + +/// Wake up the tracer to make sure it catches on that the tracee is dead. This +/// is different from `close_session` in that it doesn't actually close the +/// session, and instead waits for the file handle to be closed, where the +/// session will *actually* be closed. This is partly to ensure ESRCH is +/// returned rather than ENODEV (which occurs when there's no session - should +/// never really happen). +pub fn close_tracee(session: &Session, token: &mut CleanLockToken) { + session.tracer.notify(token); + + let data = session.data.lock(); + proc_trigger_event(data.file_id, EVENT_READ, token); +} + +/// Trigger a notification to the event: scheme +fn proc_trigger_event(file_id: usize, flags: EventFlags, token: &mut CleanLockToken) { + event::trigger(GlobalSchemes::Proc.scheme_id(), file_id, flags, token); +} + +/// Dispatch an event to any tracer tracing `self`. This will cause +/// the tracer to wake up and poll for events. Returns Some(()) if an +/// event was sent.
+pub fn send_event(event: PtraceEvent, token: &mut CleanLockToken) -> Option<()> { + let session = Session::current()?; + let mut data = session.data.lock(); + let breakpoint = data.breakpoint.as_ref()?; + + if event.cause & breakpoint.flags != event.cause { + return None; + } + + // Add event to queue + data.add_event(event, token); + // Notify tracer + session.tracer.notify(token); + + Some(()) +} + +// ____ _ _ _ +// | __ ) _ __ ___ __ _| | ___ __ ___ (_)_ __ | |_ ___ +// | _ \| '__/ _ \/ _` | |/ / '_ \ / _ \| | '_ \| __/ __| +// | |_) | | | __/ (_| | <| |_) | (_) | | | | | |_\__ \ +// |____/|_| \___|\__,_|_|\_\ .__/ \___/|_|_| |_|\__|___/ +// |_| + +#[derive(Debug, Clone, Copy)] +pub(crate) struct Breakpoint { + reached: bool, + pub(crate) flags: PtraceFlags, +} + +/// Wait for the tracee to stop, or return immediately if there's an unread +/// event. +/// +/// Note: Don't call while holding any locks or allocated data, this will +/// switch contexts and may in fact just never terminate. +pub fn wait(session: Arc, token: &mut CleanLockToken) -> Result<()> { + loop { + // Lock the data, to make sure we're reading the final value before going + // to sleep. + let data = session.data.lock(); + let mut token = token.downgrade(); + + // Wake up if a breakpoint is already reached or there's an unread event + if data.breakpoint.as_ref().map(|b| b.reached).unwrap_or(false) || !data.events.is_empty() { + break; + } + + // Go to sleep, and drop the lock on our data, which will allow other the + // tracer to wake us up. + if session.tracer.wait(data, "ptrace::wait", &mut token) { + // We successfully waited, wake up! + break; + } + } + + Ok(()) +} + +/// Notify the tracer and await green flag to continue. If the breakpoint was +/// set and reached, return the flags which the user waited for. Otherwise, +/// None. +/// +/// Note: Don't call while holding any locks or allocated data, this +/// will switch contexts and may in fact just never terminate. 
+pub fn breakpoint_callback( + match_flags: PtraceFlags, + event: Option, + token: &mut CleanLockToken, +) -> Option { + loop { + let percpu = PercpuBlock::current(); + + // TODO: Some or all flags? + // Only stop if the tracer have asked for this breakpoint + if percpu.ptrace_flags.get().contains(match_flags) { + return None; + } + + let session = percpu.ptrace_session.borrow().as_ref()?.upgrade()?; + + let mut data = session.data.lock(); + let breakpoint = data.breakpoint?; // only go to sleep if there's a breakpoint + + // In case no tracer is waiting, make sure the next one gets the memo + data.breakpoint + .as_mut() + .expect("already checked that breakpoint isn't None") + .reached = true; + + // Add event to queue + data.add_event(event.unwrap_or(ptrace_event!(match_flags)), token); + + // Wake up sleeping tracer + session.tracer.notify(token); + let mut token = token.downgrade(); + + if session + .tracee + .wait(data, "ptrace::breakpoint_callback", &mut token) + { + // We successfully waited, wake up! + break Some(breakpoint.flags); + } + } +} + +/// Obtain the next breakpoint flags for the current process. This is used for +/// detecting whether or not the tracer decided to use sysemu mode. 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn next_breakpoint() -> Option { + let session = Session::current()?; + let data = session.data.lock(); + let breakpoint = data.breakpoint?; + + Some(breakpoint.flags) +} diff --git a/src/scheme/acpi.rs b/src/scheme/acpi.rs new file mode 100644 index 0000000000..87570a1297 --- /dev/null +++ b/src/scheme/acpi.rs @@ -0,0 +1,336 @@ +use alloc::{boxed::Box, vec::Vec}; +use core::convert::TryInto; + +use spin::{Mutex, Once}; +use syscall::{ + data::GlobalSchemes, + dirent::{DirEntry, DirentBuf, DirentKind}, + EIO, +}; + +use crate::{ + acpi::{RxsdtEnum, RXSDT_ENUM}, + context::file::InternalFlags, + event, + sync::{CleanLockToken, RwLock, WaitCondition, L1}, +}; + +use crate::syscall::{ + data::Stat, + error::{Error, Result, EACCES, EBADF, EBADFD, EINTR, EINVAL, EISDIR, ENOENT, ENOTDIR, EROFS}, + flag::{ + EventFlags, EVENT_READ, MODE_CHR, MODE_DIR, MODE_FILE, O_ACCMODE, O_CREAT, O_DIRECTORY, + O_EXCL, O_RDONLY, O_STAT, O_SYMLINK, + }, + usercopy::UserSliceWo, +}; + +use super::{CallerCtx, HandleMap, KernelScheme, OpenResult, SchemeExt, StrOrBytes}; + +/// A scheme used to access the RSDT or XSDT, which is needed for e.g. `acpid` to function. 
+pub struct AcpiScheme; + +#[derive(Clone, Copy)] +struct Handle { + kind: HandleKind, + stat: bool, +} +#[derive(Clone, Copy, Eq, PartialEq)] +enum HandleKind { + TopLevel, + Rxsdt, + ShutdownPipe, + SchemeRoot, +} + +static HANDLES: RwLock> = RwLock::new(HandleMap::new()); + +static DATA: Once> = Once::new(); + +static KSTOP_WAITCOND: WaitCondition = WaitCondition::new(); +static KSTOP_FLAG: Mutex = Mutex::new(false); + +pub fn register_kstop(token: &mut CleanLockToken) -> bool { + *KSTOP_FLAG.lock() = true; + let mut waiters_awoken = KSTOP_WAITCOND.notify(token); + + let fds: Vec = { + HANDLES + .read(token.token()) + .iter() + .filter(|(_, handle)| handle.kind == HandleKind::ShutdownPipe) + .map(|(fd, _)| *fd) + .collect() + }; + + for fd in fds { + event::trigger(GlobalSchemes::Acpi.scheme_id(), fd, EVENT_READ, token); + waiters_awoken += 1; + } + + if waiters_awoken == 0 { + error!("No userspace ACPI handler was notified when trying to shutdown. This is bad."); + // Let the kernel shutdown without ACPI. + return false; + } + + // TODO: Context switch directly to the waiting context, to avoid annoying timeouts. + true +} + +impl AcpiScheme { + pub fn init() { + // NOTE: This __must__ be called from the main kernel context, while initializing all + // schemes. If it is called by any other context, then all ACPI data will probably not even + // be mapped. 
+ + let mut data_init = false; + + DATA.call_once(|| { + data_init = true; + + let table = match RXSDT_ENUM.get() { + Some(RxsdtEnum::Rsdt(rsdt)) => rsdt.as_slice(), + Some(RxsdtEnum::Xsdt(xsdt)) => xsdt.as_slice(), + None => { + warn!("expected RXSDT_ENUM to be initialized before AcpiScheme, is ACPI available?"); + &[] + } + }; + + Box::from(table) + }); + + if !data_init { + error!("AcpiScheme::init called multiple times"); + } + } +} + +impl KernelScheme for AcpiScheme { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let fd = HANDLES.write(token.token()).insert(Handle { + kind: HandleKind::SchemeRoot, + stat: false, + }); + + Ok(fd) + } + fn kopenat( + &self, + id: usize, + user_buf: StrOrBytes, + flags: usize, + _fcntl_flags: u32, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + if !matches!( + HANDLES.read(token.token()).get(id)?.kind, + HandleKind::SchemeRoot + ) { + return Err(Error::new(EACCES)); + } + + let path = user_buf + .as_str() + .or(Err(Error::new(EINVAL)))? 
+ .trim_start_matches('/'); + + if ctx.uid != 0 { + return Err(Error::new(EACCES)); + } + if flags & O_CREAT == O_CREAT { + return Err(Error::new(EROFS)); + } + if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK { + return Err(Error::new(EINVAL)); + } + if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { + return Err(Error::new(EROFS)); + } + let (handle_kind, int_flags) = match path { + "" => { + if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(EISDIR)); + } + + (HandleKind::TopLevel, InternalFlags::POSITIONED) + } + "rxsdt" => { + if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(ENOTDIR)); + } + (HandleKind::Rxsdt, InternalFlags::POSITIONED) + } + "kstop" => { + if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { + return Err(Error::new(ENOTDIR)); + } + (HandleKind::ShutdownPipe, InternalFlags::empty()) + } + _ => return Err(Error::new(ENOENT)), + }; + + let fd = HANDLES.write(token.token()).insert(Handle { + kind: handle_kind, + // TODO: Redundant + stat: flags & O_STAT == O_STAT, + }); + + Ok(OpenResult::SchemeLocal(fd, int_flags)) + } + fn fsize(&self, id: usize, token: &mut CleanLockToken) -> Result { + let mut handles = HANDLES.write(token.token()); + let handle = handles.get_mut(id)?; + + if handle.stat { + return Err(Error::new(EBADF)); + } + + Ok(match handle.kind { + HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64, + HandleKind::ShutdownPipe => 1, + HandleKind::TopLevel => 0, + HandleKind::SchemeRoot => return Err(Error::new(EBADF))?, + }) + } + // TODO + fn fevent( + &self, + id: usize, + _flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + let handles = HANDLES.read(token.token()); + let handle = handles.get(id)?; + + if handle.stat { + return Err(Error::new(EBADF)); + } + + Ok(EventFlags::empty()) + } + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + 
HANDLES.write(token.token()).remove(id)?; + Ok(()) + } + fn kreadoff( + &self, + id: usize, + dst_buf: UserSliceWo, + offset: u64, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let Ok(offset) = usize::try_from(offset) else { + return Ok(0); + }; + + let handle = *HANDLES.read(token.token()).get(id)?; + + if handle.stat { + return Err(Error::new(EBADF)); + } + + let data = match handle.kind { + HandleKind::ShutdownPipe => { + if dst_buf.is_empty() { + return Ok(0); + } + + loop { + let flag_guard = KSTOP_FLAG.lock(); + let mut token = token.downgrade(); + + if *flag_guard { + break; + } else if !KSTOP_WAITCOND.wait(flag_guard, "waiting for kstop", &mut token) { + return Err(Error::new(EINTR)); + } + } + + return dst_buf.copy_exactly(&[0x42]).map(|()| 1); + } + HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?, + HandleKind::TopLevel => return Err(Error::new(EISDIR)), + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + }; + + let src_offset = core::cmp::min(offset, data.len()); + let src_buf = data + .get(src_offset..) + .expect("expected data to be at least data.len() bytes long"); + + dst_buf.copy_common_bytes_from_slice(src_buf) + } + fn getdents( + &self, + id: usize, + buf: UserSliceWo, + header_size: u16, + opaque: u64, + token: &mut CleanLockToken, + ) -> Result { + let Handle { + kind: HandleKind::TopLevel, + .. + } = HANDLES.read(token.token()).get(id)? + else { + return Err(Error::new(ENOTDIR)); + }; + + let mut buf = DirentBuf::new(buf, header_size).ok_or(Error::new(EIO))?; + if opaque == 0 { + buf.entry(DirEntry { + kind: DirentKind::Regular, + name: "rxsdt", + inode: 0, + next_opaque_id: 1, + })?; + } + if opaque <= 1 { + buf.entry(DirEntry { + kind: DirentKind::Socket, + name: "kstop", + inode: 0, + next_opaque_id: u64::MAX, + })?; + } + Ok(buf.finalize()) + } + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + //TODO: construct useful path? 
+ buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes()) + } + fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let handles = HANDLES.read(token.token()); + let handle = handles.get(id)?; + + buf.copy_exactly(&match handle.kind { + HandleKind::Rxsdt => { + let data = DATA.get().ok_or(Error::new(EBADFD))?; + + Stat { + st_mode: MODE_FILE, + st_size: data.len().try_into().unwrap_or(u64::MAX), + ..Default::default() + } + } + HandleKind::TopLevel => Stat { + st_mode: MODE_DIR, + st_size: 0, + ..Default::default() + }, + HandleKind::ShutdownPipe => Stat { + st_mode: MODE_CHR, + st_size: 1, + ..Default::default() + }, + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + })?; + + Ok(()) + } +} diff --git a/src/scheme/debug.rs b/src/scheme/debug.rs new file mode 100644 index 0000000000..c70ac5792b --- /dev/null +++ b/src/scheme/debug.rs @@ -0,0 +1,237 @@ +use syscall::data::GlobalSchemes; + +use crate::{ + devices::graphical_debug, + event, + log::Writer, + scheme::*, + sync::{CleanLockToken, RwLock, WaitQueue, L1}, + syscall::{ + flag::{EventFlags, EVENT_READ, O_NONBLOCK}, + usercopy::{UserSliceRo, UserSliceWo}, + }, +}; + +/// Input queue +static INPUT: WaitQueue = WaitQueue::new(); + +#[derive(Clone, Copy)] +struct Handle { + num: usize, +} + +static HANDLES: RwLock> = RwLock::new(HandleMap::new()); + +/// Add to the input queue +pub fn debug_input(data: u8, token: &mut CleanLockToken) { + INPUT.send(data, token); +} + +// Notify readers of input updates +pub fn debug_notify(token: &mut CleanLockToken) { + let ids: Vec = { HANDLES.read(token.token()).iter().map(|x| *x.0).collect() }; + for id in ids { + event::trigger(GlobalSchemes::Debug.scheme_id(), id, EVENT_READ, token); + } +} + +pub struct DebugScheme; + +#[repr(usize)] +enum SpecialFds { + Default = -1isize as usize, + NoPreserve = -2isize as usize, + DisableGraphicalDebug = -3isize as usize, + + #[cfg(feature = "profiling")] + CtlProfiling = 
-4isize as usize, + + SchemeRoot = -5isize as usize, + // NOTE: when adding new entries, ensure are checked correctly by the profiling code +} + +impl KernelScheme for DebugScheme { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let id = HANDLES.write(token.token()).insert(Handle { + num: SpecialFds::SchemeRoot as usize, + }); + + Ok(id) + } + fn kopenat( + &self, + id: usize, + user_buf: StrOrBytes, + _flags: usize, + _fcntl_flags: u32, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + if HANDLES.read(token.token()).get(id)?.num != SpecialFds::SchemeRoot as usize { + return Err(Error::new(EACCES)); + } + + let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; + if ctx.uid != 0 { + return Err(Error::new(EPERM)); + } + + let num = match path { + "" => SpecialFds::Default as usize, + + "no-preserve" => SpecialFds::NoPreserve as usize, + + "disable-graphical-debug" => SpecialFds::DisableGraphicalDebug as usize, + + #[cfg(feature = "profiling")] + p if p.starts_with("profiling-") => { + let num: u32 = path[10..].parse().map_err(|_| Error::new(ENOENT))?; + if !crate::profiling::cpu_exists(crate::cpu_set::LogicalCpuId::new(num)) { + return Err(Error::new(ENOENT)); + } + num as usize + } + + #[cfg(feature = "profiling")] + "ctl-profiling" => SpecialFds::CtlProfiling as usize, + + _ => return Err(Error::new(ENOENT)), + }; + + let id = HANDLES.write(token.token()).insert(Handle { num }); + + Ok(OpenResult::SchemeLocal(id, InternalFlags::empty())) + } + + fn fevent( + &self, + id: usize, + _flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + let _handle = *HANDLES.read(token.token()).get(id)?; + + Ok(EventFlags::empty()) + } + + fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + let _handle = *HANDLES.read(token.token()).get(id)?; + + Ok(()) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + HANDLES.write(token.token()).remove(id)?; + + Ok(()) + } + fn kread( + &self, + 
id: usize, + buf: UserSliceWo, + flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let handle = *HANDLES.read(token.token()).get(id)?; + + if handle.num == SpecialFds::DisableGraphicalDebug as usize + || handle.num == SpecialFds::SchemeRoot as usize + { + return Err(Error::new(EBADF)); + } + + #[cfg(feature = "profiling")] + if handle.num == SpecialFds::CtlProfiling as usize { + return Err(Error::new(EBADF)); + } + + // TODO: add "try_from_raw" or similar to prevent future bugs + #[cfg(feature = "profiling")] + if handle.num != SpecialFds::Default as usize + && handle.num != SpecialFds::NoPreserve as usize + { + return crate::profiling::drain_buffer( + crate::cpu_set::LogicalCpuId::new(handle.num as u32), + buf, + ); + } + + INPUT.receive_into_user( + buf, + flags & O_NONBLOCK as u32 == 0, + "DebugScheme::read", + token, + ) + } + + fn kwrite( + &self, + id: usize, + buf: UserSliceRo, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let handle = *HANDLES.read(token.token()).get(id)?; + + #[cfg(feature = "profiling")] + if handle.num == SpecialFds::CtlProfiling as usize { + let mut dst = [0]; + buf.copy_to_slice(&mut dst)?; + + let is_profiling = match dst[0] { + b'0' => false, + b'1' => true, + _ => return Err(Error::new(EINVAL)), + }; + info!("Wrote {is_profiling} to IS_PROFILING"); + crate::profiling::IS_PROFILING.store(is_profiling, Ordering::Relaxed); + + return Ok(1); + } + + if handle.num == SpecialFds::DisableGraphicalDebug as usize { + graphical_debug::fini(); + + return Ok(0); + } + + if handle.num != SpecialFds::Default as usize + && handle.num != SpecialFds::NoPreserve as usize + { + return Err(Error::new(EINVAL)); + } + + let mut tmp = [0_u8; 512]; + + for chunk in buf.in_variable_chunks(tmp.len()) { + let byte_count = chunk.copy_common_bytes_to_slice(&mut tmp)?; + let tmp_bytes = &tmp[..byte_count]; + + // The reason why a new writer is created for each iteration, is because the page 
fault + // handler in usercopy might use the same lock when printing for debug purposes, and + // although it most likely won't, it would be dangerous to rely on that assumption. + Writer::new().write(tmp_bytes, handle.num != SpecialFds::NoPreserve as usize); + } + + Ok(buf.len()) + } + fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let handle = *HANDLES.read(token.token()).get(id)?; + if handle.num != SpecialFds::Default as usize + && handle.num != SpecialFds::NoPreserve as usize + { + return Err(Error::new(EINVAL)); + } + + // TODO: Copy elsewhere in the kernel? + const SRC: &[u8] = b"debug:"; + let byte_count = core::cmp::min(buf.len(), SRC.len()); + buf.limit(byte_count) + .expect("must succeed") + .copy_from_slice(&SRC[..byte_count])?; + + Ok(byte_count) + } +} diff --git a/src/scheme/dtb.rs b/src/scheme/dtb.rs new file mode 100644 index 0000000000..bc288dc9cc --- /dev/null +++ b/src/scheme/dtb.rs @@ -0,0 +1,163 @@ +use alloc::boxed::Box; +use spin::Once; + +use super::{CallerCtx, HandleMap, KernelScheme, OpenResult, StrOrBytes}; +use crate::{ + dtb::DTB_BINARY, + scheme::InternalFlags, + sync::{CleanLockToken, RwLock, L1}, + syscall::{ + data::Stat, + error::*, + flag::{MODE_FILE, O_STAT}, + usercopy::UserSliceWo, + }, +}; + +pub struct DtbScheme; + +#[derive(Eq, PartialEq)] +enum HandleKind { + RawData, + SchemeRoot, +} + +struct Handle { + kind: HandleKind, + stat: bool, +} + +static HANDLES: RwLock> = RwLock::new(HandleMap::new()); +static DATA: Once> = Once::new(); + +impl DtbScheme { + pub fn init() { + let mut data_init = false; + + DATA.call_once(|| { + data_init = true; + + Box::from(DTB_BINARY.get().copied().unwrap_or(&[])) + }); + + if !data_init { + error!("DtbScheme::new called multiple times"); + } + } +} + +impl KernelScheme for DtbScheme { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let id = HANDLES.write(token.token()).insert(Handle { + kind: HandleKind::SchemeRoot, + stat: false, 
+ }); + Ok(id) + } + fn kopenat( + &self, + id: usize, + user_buf: StrOrBytes, + _flags: usize, + _fcntl_flags: u32, + _ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + if !matches!( + HANDLES.read(token.token()).get(id)?.kind, + HandleKind::SchemeRoot + ) { + return Err(Error::new(EACCES)); + } + + let path = user_buf + .as_str() + .or(Err(Error::new(EINVAL)))? + .trim_matches('/'); + + if path.is_empty() { + let id = HANDLES.write(token.token()).insert(Handle { + kind: HandleKind::RawData, + stat: _flags & O_STAT == O_STAT, + }); + return Ok(OpenResult::SchemeLocal(id, InternalFlags::POSITIONED)); + } + + Err(Error::new(ENOENT)) + } + + fn fsize(&self, id: usize, token: &mut CleanLockToken) -> Result { + let mut handles = HANDLES.write(token.token()); + let handle = handles.get_mut(id)?; + + if handle.stat { + return Err(Error::new(EBADF)); + } + + let file_len = match handle.kind { + HandleKind::RawData => DATA.get().ok_or(Error::new(EBADFD))?.len(), + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + }; + + Ok(file_len as u64) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + HANDLES.write(token.token()).remove(id)?; + Ok(()) + } + /// Copies DTB bytes starting at `offset` into `dst_buf`; stat-only and scheme-root handles get EBADF, and an offset at or past the end selects an empty source slice. + fn kreadoff( + &self, + id: usize, + dst_buf: UserSliceWo, + offset: u64, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let handles = HANDLES.read(token.token()); // nothing is mutated, so a shared lock suffices (as in kfstat) + let handle = handles.get(id)?; + + if handle.stat { + return Err(Error::new(EBADF)); + } + + let data = match handle.kind { + HandleKind::RawData => DATA.get().ok_or(Error::new(EBADFD))?, + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + }; + // Clamp instead of unwrapping: where usize is narrower than u64, an oversized offset must read as end-of-data rather than panic. + let src_offset = usize::try_from(offset).map_or(data.len(), |off| off.min(data.len())); + let src_buf = data + .get(src_offset..)
+ .expect("expected data to be at least data.len() bytes long"); + + dst_buf.copy_common_bytes_from_slice(src_buf) + } + + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + //TODO: construct useful path? + buf.copy_common_bytes_from_slice("/scheme/kernel.dtb/".as_bytes()) + } + + fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let handles = HANDLES.read(token.token()); + let handle = handles.get(id)?; + buf.copy_exactly(&match handle.kind { + HandleKind::RawData => { + let data = DATA.get().ok_or(Error::new(EBADFD))?; + Stat { + st_mode: MODE_FILE, + st_uid: 0, + st_gid: 0, + st_size: data.len().try_into().unwrap_or(u64::MAX), + ..Default::default() + } + } + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + })?; + + Ok(()) + } +} diff --git a/src/scheme/event.rs b/src/scheme/event.rs new file mode 100644 index 0000000000..36efe5b2b0 --- /dev/null +++ b/src/scheme/event.rs @@ -0,0 +1,130 @@ +use alloc::sync::Arc; +use syscall::{EventFlags, O_NONBLOCK}; + +use crate::{ + context::file::InternalFlags, + event::{next_queue_id, queues, queues_mut, EventQueue, EventQueueId}, + sync::CleanLockToken, + syscall::{ + data::Event, + error::*, + usercopy::{UserSliceRo, UserSliceWo}, + }, +}; + +use super::{CallerCtx, KernelScheme, OpenResult, StrOrBytes}; + +const SCHEME_ROOT_ID: usize = usize::MAX; + +pub struct EventScheme; + +impl KernelScheme for EventScheme { + fn scheme_root(&self, _token: &mut CleanLockToken) -> Result { + Ok(SCHEME_ROOT_ID) + } + fn kopenat( + &self, + id: usize, + _user_buf: StrOrBytes, + _flags: usize, + _fcntl_flags: u32, + _ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + if id != SCHEME_ROOT_ID { + return Err(Error::new(EACCES)); + } + let id = next_queue_id(); + queues_mut(token.token()).insert(id, Arc::new(EventQueue::new(id))); + + Ok(OpenResult::SchemeLocal(id.get(), InternalFlags::empty())) + } + + fn close(&self, id: usize, token: 
&mut CleanLockToken) -> Result<()> { + let id = EventQueueId::from(id); + let queue = queues_mut(token.token()) + .remove(&id) + .ok_or(Error::new(EBADF))?; + if let Some(queue) = Arc::into_inner(queue) { + queue.into_drop(token.downgrade()); + } + Ok(()) + } + + fn kread( + &self, + id: usize, + buf: UserSliceWo, + flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let id = EventQueueId::from(id); + + let queue = { + let handles = queues(token.token()); + let handle = handles.get(&id).ok_or(Error::new(EBADF))?; + handle.clone() + }; + + queue.read(buf, flags & O_NONBLOCK as u32 == 0, token) + } + + fn kwrite( + &self, + id: usize, + buf: UserSliceRo, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let id = EventQueueId::from(id); + + let queue = { + let handles = queues(token.token()); + let handle = handles.get(&id).ok_or(Error::new(EBADF))?; + handle.clone() + }; + let mut events_written = 0; + + for chunk in buf.in_exact_chunks(size_of::()) { + let event = unsafe { chunk.read_exact::()? }; + if queue.write(&[event], token)? 
== 0 { + break; + } + events_written += 1; + } + + Ok(events_written * size_of::()) + } + + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + buf.copy_common_bytes_from_slice(b"/scheme/event/") + } + + fn fevent( + &self, + id: usize, + flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + let id = EventQueueId::from(id); + + let queue = { + let handles = queues(token.token()); + let handle = handles.get(&id).ok_or(Error::new(EBADF))?; + handle.clone() + }; + + let mut ready = EventFlags::empty(); + if flags.contains(EventFlags::EVENT_WRITE) { + // It is always possible to write events + ready |= EventFlags::EVENT_WRITE; + } + if flags.contains(EventFlags::EVENT_READ) && !queue.is_currently_empty(token) { + // It is possible to read if queue is not empty + ready |= EventFlags::EVENT_READ; + } + Ok(ready) + } +} diff --git a/src/scheme/irq.rs b/src/scheme/irq.rs new file mode 100644 index 0000000000..a8795e5958 --- /dev/null +++ b/src/scheme/irq.rs @@ -0,0 +1,570 @@ +// TODO: Rewrite this entire scheme. Legacy x86 APIs should be abstracted by a userspace scheme, +// this scheme should only handle raw IRQ registration and delivery to userspace. 
+ +use alloc::{borrow::ToOwned, string::String, vec::Vec}; +use core::{ + str, + str::FromStr, + sync::atomic::{AtomicUsize, Ordering}, +}; + +use smallvec::SmallVec; +use spin::{Mutex, Once}; +use syscall::{ + data::GlobalSchemes, + dirent::{DirEntry, DirentBuf, DirentKind}, +}; + +use crate::context::file::InternalFlags; + +use super::{CallerCtx, HandleMap, OpenResult, SchemeExt, StrOrBytes}; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use crate::arch::interrupt::{available_irqs_iter, irq::acknowledge, is_reserved, set_reserved}; +#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] +use crate::dtb::irqchip::{acknowledge, available_irqs_iter, is_reserved, set_reserved, IRQ_CHIP}; +use crate::{ + cpu_set::LogicalCpuId, + event, + sync::{CleanLockToken, RwLock, L1}, + syscall::{ + data::Stat, + error::*, + flag::{EventFlags, EVENT_READ, MODE_CHR, MODE_DIR, O_CREAT, O_DIRECTORY, O_STAT}, + usercopy::{UserSliceRo, UserSliceWo}, + }, +}; + +/// +/// IRQ queues +pub(super) static COUNTS: Mutex<[usize; 224]> = Mutex::new([0; 224]); +static HANDLES: RwLock> = RwLock::new(HandleMap::new()); + +/// These are IRQs 0..=15 (corresponding to interrupt vectors 32..=47). They are opened without the +/// O_CREAT flag. +const BASE_IRQ_COUNT: u8 = 16; + +/// These are the extended IRQs, 16..=223 (interrupt vectors 48..=255). Some of them are reserved +/// for other devices, and some other interrupt vectors like 0x80 (software interrupts) and +/// 0x40..=0x43 (IPI). +/// +/// Since these are non-sharable, they must be opened with O_CREAT, which then reserves them. They +/// are only freed when the file descriptor is closed. 
+const TOTAL_IRQ_COUNT: u8 = 224; + +const INO_TOPLEVEL: u64 = 0x8002_0000_0000_0000; +const INO_AVAIL: u64 = 0x8000_0000_0000_0000; +const INO_BSP: u64 = 0x8001_0000_0000_0000; +const INO_PHANDLE: u64 = 0x8003_0000_0000_0000; + +/// Add to the input queue +pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) { + COUNTS.lock()[irq as usize] += 1; + let fds: SmallVec<[usize; 8]> = { + HANDLES + .read(token.token()) + .iter() + .filter_map(|(fd, handle)| Some((fd, handle.as_irq_handle()?))) + .filter(|&(_, (_, handle_irq))| handle_irq == irq) + .map(|(f, _)| *f) + .collect() + }; + + for fd in fds { + event::trigger(GlobalSchemes::Irq.scheme_id(), fd, EVENT_READ, token); + } +} + +#[allow(dead_code)] +enum Handle { + SchemeRoot, + Irq { ack: AtomicUsize, irq: u8 }, + Avail(LogicalCpuId), + TopLevel, + Phandle(u8, Vec), + Bsp, +} +impl Handle { + fn as_irq_handle(&self) -> Option<(&AtomicUsize, u8)> { + match self { + &Self::Irq { ref ack, irq } => Some((ack, irq)), + _ => None, + } + } +} + +static CPUS: Once> = Once::new(); + +pub struct IrqScheme; + +impl IrqScheme { + pub fn init() { + let cpus = if cfg!(all( + feature = "acpi", + any(target_arch = "x86", target_arch = "x86_64") + )) { + use crate::acpi::madt::{madt, MadtEntry}; + + match madt() { + Some(madt) => madt + .iter() + .filter_map(|entry| match entry { + MadtEntry::LocalApic(apic) => Some(apic.processor), + _ => None, + }) + .collect::>(), + None => { + warn!("no MADT found, defaulting to 1 CPU"); + vec![0] + } + } + } else { + vec![0] + }; + + CPUS.call_once(|| cpus); + } + fn open_ext_irq( + flags: usize, + cpu_id: LogicalCpuId, + path_str: &str, + ) -> Result<(Handle, InternalFlags)> { + let irq_number = u8::from_str(path_str).or(Err(Error::new(ENOENT)))?; + + Ok( + if irq_number < BASE_IRQ_COUNT && cpu_id == LogicalCpuId::BSP { + // Give legacy IRQs only to `irq:{0..15}` and `irq:cpu-/{0..15}` (same handles). + // + // The only CPUs don't have the legacy IRQs in their IDTs. 
+ + ( + Handle::Irq { + ack: AtomicUsize::new(0), + irq: irq_number, + }, + InternalFlags::empty(), + ) + } else if irq_number < TOTAL_IRQ_COUNT { + if flags & O_CREAT == 0 && flags & O_STAT == 0 { + return Err(Error::new(EINVAL)); + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + if flags & O_STAT == 0 { + if is_reserved(cpu_id, irq_to_vector(irq_number)) { + return Err(Error::new(EEXIST)); + } + set_reserved(cpu_id, irq_to_vector(irq_number), true); + } + ( + Handle::Irq { + ack: AtomicUsize::new(0), + irq: irq_number, + }, + InternalFlags::empty(), + ) + } else { + return Err(Error::new(ENOENT)); + }, + ) + } + + #[cfg(dtb)] + unsafe fn open_phandle_irq( + flags: usize, + phandle: usize, + path_str: &str, + ) -> Result<(Handle, InternalFlags)> { + unsafe { + let addr: Vec = path_str + .split(',') + .map(|x| u32::from_str(x).or(Err(Error::new(ENOENT)))) + .collect::>()?; + let ic_idx = IRQ_CHIP + .phandle_to_ic_idx(phandle as u32) + .ok_or(Error::new(ENOENT))?; + Ok({ + if flags & O_CREAT == 0 && flags & O_STAT == 0 { + return Err(Error::new(EINVAL)); + } + let irq_number = IRQ_CHIP + .irq_xlate(ic_idx, addr.as_slice()) + .or(Err(Error::new(ENOENT)))?; + debug!("open_phandle_irq virq={}", irq_number); + if flags & O_STAT == 0 { + if is_reserved(LogicalCpuId::new(0), irq_number as u8) { + return Err(Error::new(EEXIST)); + } + set_reserved(LogicalCpuId::new(0), irq_number as u8, true); + } + ( + Handle::Irq { + ack: AtomicUsize::new(0), + irq: irq_number as u8, + }, + InternalFlags::empty(), + ) + }) + } + } +} + +const fn irq_to_vector(irq: u8) -> u8 { + irq + 32 +} +const fn vector_to_irq(vector: u8) -> u8 { + vector - 32 +} + +impl crate::scheme::KernelScheme for IrqScheme { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let id = HANDLES.write(token.token()).insert(Handle::SchemeRoot); + Ok(id) + } + fn kopenat( + &self, + id: usize, + user_buf: StrOrBytes, + flags: usize, + _fcntl_flags: u32, + ctx: CallerCtx, + token: &mut 
CleanLockToken, + ) -> Result { + { + let handles = HANDLES.read(token.token()); + let handle = handles.get(id)?; + + if !matches!(handle, Handle::SchemeRoot) { + return Err(Error::new(EACCES)); + } + } + + let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; + if ctx.uid != 0 { + return Err(Error::new(EACCES)); + } + + let path_str = path.trim_start_matches('/'); + + let (handle, int_flags) = if path_str.is_empty() { + if flags & O_DIRECTORY == 0 && flags & O_STAT == 0 { + return Err(Error::new(EISDIR)); + } + // list every logical CPU in the format of e.g. `cpu-1b` + + let mut bytes = String::new(); + + use core::fmt::Write; + + writeln!(bytes, "bsp").unwrap(); + + for cpu_id in CPUS.get().expect("IRQ scheme not initialized") { + writeln!(bytes, "cpu-{:02x}", cpu_id).unwrap(); + } + + #[cfg(dtb)] + unsafe { + for chip in &IRQ_CHIP.irq_chip_list.chips { + writeln!(bytes, "phandle-{}", chip.phandle).unwrap(); + } + } + + (Handle::TopLevel, InternalFlags::POSITIONED) + } else if path_str == "bsp" { + (Handle::Bsp, InternalFlags::empty()) + } else if let Some(path_str) = path_str.strip_prefix("cpu-") { + // `get(..2)` instead of `[..2]`: a short or non-ASCII CPU id must yield ENOENT, not a kernel panic. + let cpu_id = path_str.get(..2).and_then(|hex| u8::from_str_radix(hex, 16).ok()).ok_or(Error::new(ENOENT))?; + let path_str = path_str[2..].trim_end_matches('/'); + if path_str.is_empty() { + ( + Handle::Avail(LogicalCpuId::new(cpu_id.into())), + InternalFlags::POSITIONED, + ) + } else if let Some(path_str) = path_str.strip_prefix('/') { + Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
+ } else { + return Err(Error::new(ENOENT)); + } + } else if cfg!(dtb) && path_str.starts_with("phandle-") { + #[cfg(dtb)] + unsafe { + let rest = &path_str["phandle-".len()..]; // without a '/', the whole suffix is the phandle number + let (phandle_str, path_str) = rest.split_once('/').unwrap_or((rest, "")); + let phandle = usize::from_str(phandle_str).or(Err(Error::new(ENOENT)))?; + if path_str.is_empty() { + let has_any = IRQ_CHIP.irq_iter_for(phandle as u32).next().is_some(); + if has_any { + let data = String::new(); + ( + Handle::Phandle(phandle as u8, data.into_bytes()), + InternalFlags::POSITIONED, + ) + } else { + return Err(Error::new(ENOENT)); + } + } else { + Self::open_phandle_irq(flags, phandle, path_str)? + } + } + #[cfg(not(dtb))] + panic!("") + } else if let Ok(plain_irq_number) = u8::from_str(path_str) { + if plain_irq_number < BASE_IRQ_COUNT { + ( + Handle::Irq { + ack: AtomicUsize::new(0), + irq: plain_irq_number, + }, + InternalFlags::empty(), + ) + } else { + return Err(Error::new(ENOENT)); + } + } else { + return Err(Error::new(ENOENT)); + }; + let fd = HANDLES.write(token.token()).insert(handle); + Ok(OpenResult::SchemeLocal(fd, int_flags)) + } + fn getdents( + &self, + id: usize, + buf: UserSliceWo, + header_size: u16, + opaque_id_start: u64, + token: &mut CleanLockToken, + ) -> Result { + let Ok(opaque) = usize::try_from(opaque_id_start) else { + return Ok(0); + }; + + use core::fmt::Write; + + let mut buf = DirentBuf::new(buf, header_size).ok_or(Error::new(EIO))?; + let mut intermediate = String::new(); + + match *HANDLES.read(token.token()).get(id)? { + Handle::TopLevel => { + let cpus = CPUS.get().expect("IRQ scheme not initialized"); + + if opaque == 0 { + buf.entry(DirEntry { + inode: 0, + next_opaque_id: 1, + kind: DirentKind::CharDev, + name: "bsp", + })?; + } + + // list every logical CPU in the format of e.g.
`cpu-1b` + for cpu_id in cpus.iter().filter(|i| opaque <= usize::from(**i)) { + intermediate.clear(); + write!(&mut intermediate, "cpu-{:02x}", cpu_id).unwrap(); + buf.entry(DirEntry { + kind: DirentKind::Directory, + name: &intermediate, + inode: 0, + next_opaque_id: u64::from(*cpu_id + 1), + })?; + } + } + Handle::Avail(cpu_id) => { + for vector in available_irqs_iter(cpu_id).skip(opaque) { + let irq = vector_to_irq(vector); + if cpu_id == LogicalCpuId::BSP && irq < BASE_IRQ_COUNT { + continue; + } + intermediate.clear(); + write!(intermediate, "{}", irq).unwrap(); + buf.entry(DirEntry { + inode: 0, + kind: DirentKind::CharDev, + name: &intermediate, + next_opaque_id: u64::from(vector) + 1, + })?; + } + } + _ => return Err(Error::new(ENOTDIR)), + } + Ok(buf.finalize()) + } + + fn fcntl( + &self, + _id: usize, + _cmd: usize, + _arg: usize, + _token: &mut CleanLockToken, + ) -> Result { + Ok(0) + } + + fn fevent( + &self, + _id: usize, + _flags: EventFlags, + _token: &mut CleanLockToken, + ) -> Result { + Ok(EventFlags::empty()) + } + + fn fsync(&self, _file: usize, _token: &mut CleanLockToken) -> Result<()> { + Ok(()) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + let handles_guard = HANDLES.read(token.token()); + let handle = handles_guard.get(id)?; + + if let &Handle::Irq { + irq: handle_irq, .. 
+ } = handle + && handle_irq > BASE_IRQ_COUNT + { + set_reserved(LogicalCpuId::BSP, irq_to_vector(handle_irq), false); + } + Ok(()) + } + fn kwrite( + &self, + file: usize, + buffer: UserSliceRo, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let handles_guard = HANDLES.read(token.token()); + let handle = handles_guard.get(file)?; + + match handle { + &Handle::Irq { + irq: handle_irq, + ack: ref handle_ack, + } => { + if buffer.len() < size_of::() { + return Err(Error::new(EINVAL)); + } + let ack = buffer.read_usize()?; + let current = COUNTS.lock()[handle_irq as usize]; + + if ack != current { + return Ok(0); + } + handle_ack.store(ack, Ordering::SeqCst); + unsafe { + acknowledge(handle_irq as usize); + } + Ok(size_of::()) + } + _ => Err(Error::new(EBADF)), + } + } + + fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let handles_guard = HANDLES.read(token.token()); + let handle = handles_guard.get(id)?; + + buf.copy_exactly(&match *handle { + Handle::Irq { + irq: handle_irq, .. 
+ } => Stat { + st_mode: MODE_CHR | 0o600, + st_size: size_of::() as u64, + st_blocks: 1, + st_blksize: size_of::() as u32, + st_ino: handle_irq.into(), + st_nlink: 1, + ..Default::default() + }, + Handle::Bsp => Stat { + st_mode: MODE_CHR | 0o400, + st_size: size_of::() as u64, + st_blocks: 1, + st_blksize: size_of::() as u32, + st_ino: INO_BSP, + st_nlink: 1, + ..Default::default() + }, + Handle::Avail(cpu_id) => Stat { + st_mode: MODE_DIR | 0o700, + st_size: 0, + st_ino: INO_AVAIL | (u64::from(cpu_id.get()) << 32), + st_nlink: 2, + ..Default::default() + }, + Handle::Phandle(phandle, ref buf) => Stat { + st_mode: MODE_DIR | 0o700, + st_size: buf.len() as u64, + st_ino: INO_PHANDLE | (u64::from(phandle) << 32), + st_nlink: 2, + ..Default::default() + }, + Handle::TopLevel => Stat { + st_mode: MODE_DIR | 0o500, + st_size: 0, + st_ino: INO_TOPLEVEL, + st_nlink: 1, + ..Default::default() + }, + _ => return Err(Error::new(EBADF)), + })?; + + Ok(()) + } + fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let handles_guard = HANDLES.read(token.token()); + let handle = handles_guard.get(id)?; + + let scheme_path = match handle { + Handle::Irq { irq, .. 
} => format!("irq:{}", irq), + Handle::Bsp => "irq:bsp".to_owned(), + Handle::Avail(cpu_id) => format!("irq:cpu-{:02x}", cpu_id.get()), // zero-padded so the path matches the `cpu-{:02x}` names that kopenat parses and getdents lists + Handle::Phandle(phandle, _) => format!("irq:phandle-{}", phandle), + Handle::TopLevel => "irq:".to_owned(), + _ => return Err(Error::new(EBADF)), + } + .into_bytes(); + + buf.copy_common_bytes_from_slice(&scheme_path) + } + fn kreadoff( + &self, + file: usize, + buffer: UserSliceWo, + _offset: u64, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let handles_guard = HANDLES.read(token.token()); + let handle = handles_guard.get(file)?; + + match *handle { + // Ensures that the length of the buffer is larger than the size of a usize + Handle::Irq { + irq: handle_irq, + ack: ref handle_ack, + } => { + if buffer.len() < size_of::() { + return Err(Error::new(EINVAL)); + } + let current = COUNTS.lock()[handle_irq as usize]; + if handle_ack.load(Ordering::SeqCst) != current { + buffer.write_usize(current)?; + Ok(size_of::()) + } else { + Ok(0) + } + } + Handle::Bsp => { + if buffer.len() < size_of::() { + return Err(Error::new(EINVAL)); + } + buffer.write_u32(LogicalCpuId::BSP.get())?; + Ok(size_of::()) + } + Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot => { + Err(Error::new(EISDIR)) + } + } + } +} diff --git a/src/scheme/memory.rs b/src/scheme/memory.rs new file mode 100644 index 0000000000..c2f9f4747e --- /dev/null +++ b/src/scheme/memory.rs @@ -0,0 +1,326 @@ +use core::num::NonZeroUsize; + +use alloc::{sync::Arc, vec::Vec}; +use rmm::PhysicalAddress; + +use crate::{ + context::{ + file::InternalFlags, + memory::{handle_notify_files, AddrSpace, AddrSpaceWrapper, Grant, PageSpan}, + }, + memory::{free_frames, used_frames, Frame, VirtualAddress, PAGE_SIZE}, + sync::CleanLockToken, + syscall::{ + data::{Map, StatVfs}, + error::*, + flag::MapFlags, + usercopy::{UserSliceRw, UserSliceWo}, + }, +}; + +use super::{CallerCtx, KernelScheme, OpenResult, StrOrBytes}; + +pub
struct MemoryScheme; + +// TODO: Use crate that autogenerates conversion functions. +#[repr(u8)] +#[derive(Clone, Copy, Debug, PartialEq)] +enum HandleTy { + Allocated = 0, + PhysBorrow = 1, + Translation = 2, +} +#[repr(u8)] +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum MemoryType { + Writeback = 0, + Uncacheable = 1, + WriteCombining = 2, + DeviceMemory = 3, +} + +bitflags! { + struct HandleFlags: u16 { + // TODO: below 32 bits? + const PHYS_CONTIGUOUS = 1; + } +} + +fn from_raw(raw: u32) -> Option<(HandleTy, MemoryType, HandleFlags)> { + Some(( + match raw & 0xFF { + 0 => HandleTy::Allocated, + 1 => HandleTy::PhysBorrow, + 2 => HandleTy::Translation, + + _ => return None, + }, + match (raw >> 8) & 0xFF { + 0 => MemoryType::Writeback, + 1 => MemoryType::Uncacheable, + 2 => MemoryType::WriteCombining, + 3 => MemoryType::DeviceMemory, + + _ => return None, + }, + HandleFlags::from_bits_truncate((raw >> 16) as u16), + )) +} + +impl MemoryScheme { + pub fn fmap_anonymous( + addr_space: &Arc, + map: &Map, + is_phys_contiguous: bool, + token: &mut CleanLockToken, + ) -> Result { + let span = PageSpan::validate_nonempty(VirtualAddress::new(map.address), map.size) + .ok_or(Error::new(EINVAL))?; + let page_count = NonZeroUsize::new(span.count).ok_or(Error::new(EINVAL))?; + + let mut notify_files = Vec::new(); + + if is_phys_contiguous && map.flags.contains(MapFlags::MAP_SHARED) { + // TODO: Should this be supported? 
+ return Err(Error::new(EOPNOTSUPP)); + } + + let fixed = map.flags.contains(MapFlags::MAP_FIXED) + || map.flags.contains(MapFlags::MAP_FIXED_NOREPLACE); + + let mut lock_token = token.token(); + let page = addr_space.acquire_write(lock_token.downgrade()).mmap( + addr_space, + (map.address != 0 || fixed).then_some(span.base), + page_count, + map.flags, + Some(&mut notify_files), + |dst_page, flags, mapper, flusher| { + let span = PageSpan::new(dst_page, page_count.get()); + if is_phys_contiguous { + Ok(Grant::zeroed_phys_contiguous(span, flags, mapper, flusher)?) + } else { + Ok(Grant::zeroed( + span, + flags, + mapper, + flusher, + map.flags.contains(MapFlags::MAP_SHARED), + )?) + } + }, + )?; + + handle_notify_files(notify_files, token); + + Ok(page.start_address().data()) + } + pub fn physmap( + physical_address: usize, + size: usize, + flags: MapFlags, + memory_type: MemoryType, + token: &mut CleanLockToken, + ) -> Result { + // TODO: Check physical_address against the real MAXPHYADDR. 
+ let end = 1 << 52; + if (physical_address.saturating_add(size) as u64) > end + || !physical_address.is_multiple_of(PAGE_SIZE) + { + return Err(Error::new(EINVAL)); + } + + if !size.is_multiple_of(PAGE_SIZE) { + warn!( + "physmap size {} is not multiple of PAGE_SIZE {}", + size, PAGE_SIZE + ); + return Err(Error::new(EINVAL)); + } + let page_count = NonZeroUsize::new(size.div_ceil(PAGE_SIZE)).ok_or(Error::new(EINVAL))?; + + let current_addrsp = AddrSpace::current()?; + + let mut lock_token = token.token(); + let base_page = current_addrsp + .acquire_write(lock_token.downgrade()) + .mmap_anywhere( + ¤t_addrsp, + page_count, + flags, + |dst_page, mut page_flags, dst_mapper, dst_flusher| { + match memory_type { + // Default + MemoryType::Writeback => (), + + MemoryType::WriteCombining => page_flags = page_flags.write_combining(true), + MemoryType::Uncacheable => page_flags = page_flags.uncacheable(true), + MemoryType::DeviceMemory => page_flags = page_flags.device_memory(true), + } + + Grant::physmap( + Frame::containing(PhysicalAddress::new(physical_address)), + PageSpan::new(dst_page, page_count.get()), + page_flags, + dst_mapper, + dst_flusher, + ) + }, + )?; + Ok(base_page.start_address().data()) + } +} + +const SCHEME_ROOT_ID: usize = usize::MAX; + +impl KernelScheme for MemoryScheme { + fn scheme_root(&self, _token: &mut CleanLockToken) -> Result { + Ok(SCHEME_ROOT_ID) + } + fn kopenat( + &self, + id: usize, + user_buf: StrOrBytes, + _flags: usize, + _fcntl_flags: u32, + ctx: CallerCtx, + _token: &mut CleanLockToken, + ) -> Result { + if id != SCHEME_ROOT_ID { + return Err(Error::new(EACCES)); + } + let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; + if path.len() > 64 { + return Err(Error::new(ENOENT)); + } + let path = path.trim_start_matches('/'); + + let (before_memty, memty_str) = path.split_once('@').unwrap_or((path, "")); + let (before_ty, type_str) = memty_str.split_once('?').unwrap_or((memty_str, "")); + + let handle_ty = match before_memty { 
+ "" | "zeroed" => HandleTy::Allocated, + "physical" => HandleTy::PhysBorrow, + "translation" => HandleTy::Translation, + "scheme-root" => { + return Ok(OpenResult::SchemeLocal( + SCHEME_ROOT_ID, + InternalFlags::empty(), + )) + } + + _ => return Err(Error::new(ENOENT)), + }; + let mem_ty = match before_ty { + "" | "wb" => MemoryType::Writeback, + "wc" => MemoryType::WriteCombining, + "uc" => MemoryType::Uncacheable, + "dev" => MemoryType::DeviceMemory, + + _ => return Err(Error::new(ENOENT)), + }; + + let flags = type_str + .split(',') + .filter_map(|ty_str| match ty_str { + //"32" => HandleFlags::BELOW_4G, + "phys_contiguous" => Some(Some(HandleFlags::PHYS_CONTIGUOUS)), + "" => None, + _ => Some(None), + }) + .collect::>() + .ok_or(Error::new(ENOENT))?; + + // TODO: Support arches with other default memory types? + if ctx.uid != 0 + && (!flags.is_empty() + || !matches!( + (handle_ty, mem_ty), + (HandleTy::Allocated, MemoryType::Writeback) + )) + { + return Err(Error::new(EACCES)); + } + + Ok(OpenResult::SchemeLocal( + (handle_ty as usize) | ((mem_ty as usize) << 8) | (usize::from(flags.bits()) << 16), + InternalFlags::empty(), + )) + } + fn kcall( + &self, + id: usize, + payload: UserSliceRw, + _flags: syscall::CallFlags, + _metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let (handle_ty, _, _) = u32::try_from(id) + .ok() + .and_then(from_raw) + .ok_or(Error::new(EBADF))?; + + match handle_ty { + HandleTy::Translation => { + let virt = VirtualAddress::new(payload.read_usize()?); + let mut token = token.token(); + let addr = AddrSpace::current()?; + let addr = addr.acquire_read(token.downgrade()); + let (phys, _) = addr + .table + .utable + .translate(virt) + .ok_or(Error::new(ENOENT))?; + payload.write_usize(phys.data())?; + + // could just return address directly, but physaddrs might conflict with the bit + // patterns reserved for error codes + Ok(0) + } + HandleTy::Allocated | HandleTy::PhysBorrow => Err(Error::new(EOPNOTSUPP)), + } + } + + fn 
kfmap( + &self, + id: usize, + addr_space: &Arc, + map: &Map, + _consume: bool, + token: &mut CleanLockToken, + ) -> Result { + let (handle_ty, mem_ty, flags) = u32::try_from(id) + .ok() + .and_then(from_raw) + .ok_or(Error::new(EBADF))?; + + match handle_ty { + HandleTy::Allocated => Self::fmap_anonymous( + addr_space, + map, + flags.contains(HandleFlags::PHYS_CONTIGUOUS), + token, + ), + HandleTy::PhysBorrow => Self::physmap(map.offset, map.size, map.flags, mem_ty, token), + HandleTy::Translation => Err(Error::new(EOPNOTSUPP)), + } + } + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + //TODO: construct useful path? + buf.copy_common_bytes_from_slice("/scheme/memory/".as_bytes()) + } + fn kfstatvfs(&self, _file: usize, dst: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> { + let used = used_frames() as u64; + let free = free_frames() as u64; + + let stat = StatVfs { + f_bsize: PAGE_SIZE.try_into().map_err(|_| Error::new(EOVERFLOW))?, + f_blocks: used + free, + f_bfree: free, + f_bavail: free, + }; + dst.copy_exactly(&stat)?; + + Ok(()) + } +} diff --git a/src/scheme/mod.rs b/src/scheme/mod.rs new file mode 100644 index 0000000000..9da2b28220 --- /dev/null +++ b/src/scheme/mod.rs @@ -0,0 +1,795 @@ +//! # Schemes +//! A scheme is a primitive for handling filesystem syscalls in Redox. +//! Schemes accept paths from the kernel for `open`, and file descriptors that they generate +//! are then passed for operations like `close`, `read`, `write`, etc. +//! +//! The kernel validates paths and file descriptors before they are passed to schemes, +//! also stripping the scheme identifier of paths if necessary. + +// TODO: Move handling of the global namespace to userspace. 
+ +use alloc::{ + sync::{Arc, Weak}, + vec::Vec, +}; +use core::{ + str, + sync::atomic::{AtomicUsize, Ordering}, +}; +use hashbrown::hash_map::{self, DefaultHashBuilder, HashMap}; +use spin::Once; +use syscall::{ + data::{GlobalSchemes, NewFdParams, StdFsCallMeta}, + error::*, + CallFlags, EventFlags, MunmapFlags, StdFsCallKind, +}; + +use crate::{ + context::{ + self, + file::{FileDescription, InternalFlags, LockedFileDescription}, + memory::AddrSpaceWrapper, + ContextLock, + }, + sync::{CleanLockToken, LockToken, RwLock, L0, L1}, + syscall::usercopy::{UserSliceRo, UserSliceRw, UserSliceWo}, +}; + +use self::{acpi::AcpiScheme, dtb::DtbScheme}; + +use self::{ + debug::DebugScheme, + event::EventScheme, + irq::IrqScheme, + memory::MemoryScheme, + pipe::PipeScheme, + proc::ProcScheme, + serio::SerioScheme, + sys::SysScheme, + time::TimeScheme, + user::{UserInner, UserScheme}, +}; + +/// When compiled with the "acpi" feature - `acpi:` - allows drivers to read a limited set of ACPI tables. +pub mod acpi; + +pub mod dtb; + +/// `debug:` - provides access to serial console +pub mod debug; + +/// `event:` - allows reading of `Event`s which are registered using `fevent` +pub mod event; + +/// `irq:` - allows userspace handling of IRQs +pub mod irq; + +/// `memory:` - a scheme for accessing physical memory +pub mod memory; + +/// `pipe:` - used internally by the kernel to implement `pipe` +pub mod pipe; + +/// `proc:` - allows tracing processes and reading/writing their memory +pub mod proc; + +/// `serio:` - provides access to ps/2 devices +pub mod serio; + +/// `sys:` - system information, such as the context list and scheme list +pub mod sys; + +/// `time:` - allows reading time, setting timeouts and getting events when they are met +pub mod time; + +/// A wrapper around userspace schemes, tightly dependent on `root` +pub mod user; + +/// Limit on number of schemes +pub const SCHEME_MAX_SCHEMES: usize = 65_536; + +// Unique identifier for a scheme. 
+int_like!(SchemeId, usize); + +// Unique identifier for a file descriptor. +int_like!(FileHandle, AtomicFileHandle, usize, AtomicUsize); + +#[allow(dead_code)] +pub enum StrOrBytes<'a> { + Str(&'a str), + Bytes(&'a [u8]), +} + +#[allow(dead_code)] +impl<'a> StrOrBytes<'a> { + pub fn as_str(&self) -> Result<&str, core::str::Utf8Error> { + match self { + StrOrBytes::Str(path) => Ok(path), + StrOrBytes::Bytes(slice) => core::str::from_utf8(slice), + } + } + + pub fn as_bytes(&self) -> &[u8] { + match self { + StrOrBytes::Str(path) => path.as_bytes(), + StrOrBytes::Bytes(slice) => slice, + } + } + + pub fn from_str(path: &'a str) -> Self { + StrOrBytes::Str(path) + } + + pub fn from_bytes(slice: &'a [u8]) -> Self { + StrOrBytes::Bytes(slice) + } +} + +struct HandleMap { + handles: HashMap, + next_id: usize, +} + +impl HandleMap { + const fn new() -> Self { + HandleMap { + handles: HashMap::with_hasher(DefaultHashBuilder::new()), + next_id: 1, + } + } + + fn insert(&mut self, handle: T) -> usize { + let id = self.next_id; + self.next_id += 1; + self.handles.insert(id, handle); + id + } + + fn remove(&mut self, id: usize) -> Result { + self.handles.remove(&id).ok_or(Error::new(EBADF)) + } + + fn get(&self, id: usize) -> Result<&T> { + self.handles.get(&id).ok_or(Error::new(EBADF)) + } + + fn get_mut(&mut self, id: usize) -> Result<&mut T> { + self.handles.get_mut(&id).ok_or(Error::new(EBADF)) + } + + fn iter(&self) -> hash_map::Iter<'_, usize, T> { + self.handles.iter() + } +} + +enum Handle { + SchemeCreationCapability, + Scheme(KernelSchemes), +} + +/// Schemes list +static HANDLES: Once>> = Once::new(); +static SCHEME_LIST_NEXT_ID: AtomicUsize = AtomicUsize::new(MAX_GLOBAL_SCHEMES); +static SCHEME_LIST_ID: AtomicUsize = AtomicUsize::new(0); + +/// Initialize schemes, called if needed +fn init_schemes() -> RwLock> { + let mut handles = HashMap::new(); + let mut insert_globals = |globals: &[GlobalSchemes]| { + for &g in globals { + handles.insert( + SchemeId::from(g as 
usize), + Handle::Scheme(KernelSchemes::Global(g)), + ); + } + }; + + // TODO: impl TryFrom and bypass map for global schemes? + { + use GlobalSchemes::*; + insert_globals(&[Debug, Event, Memory, Pipe, Serio, Irq, Time, Sys, Proc]); + + if cfg!(feature = "acpi") { + insert_globals(&[Acpi]); + } + + if cfg!(dtb) { + insert_globals(&[Dtb]); + } + } + let next_id = SCHEME_LIST_NEXT_ID.fetch_add(1, Ordering::Relaxed); + handles.insert(SchemeId(next_id), Handle::Scheme(KernelSchemes::SchemeMgr)); + SCHEME_LIST_ID.store(next_id, Ordering::Relaxed); + + RwLock::new(handles) +} + +/// Get a handle to a scheme. +pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result { + match handles().read(token).get(&scheme_id) { + Some(Handle::Scheme(scheme)) => Ok(scheme.clone()), + _ => Err(Error::new(ENODEV)), + } +} + +fn handles<'a>() -> &'a RwLock> { + HANDLES.call_once(init_schemes) +} + +/// Scheme list type +pub struct SchemeList; + +impl SchemeList { + /// Get the id of the scheme list + pub fn id(&self) -> SchemeId { + SchemeId(SCHEME_LIST_ID.load(Ordering::Relaxed)) + } + + /// Get the UserInner + fn get_user_inner(&self, id: usize, token: &mut CleanLockToken) -> Option> { + match handles().read(token.token()).get(&SchemeId(id)) { + Some(Handle::Scheme(KernelSchemes::User(UserScheme { inner }))) => Some(inner.clone()), + _ => None, + } + } + + /// Create a new scheme. 
+ fn insert(&self, context: Weak, token: &mut CleanLockToken) -> Result { + let mut handles = handles().write(token.token()); + let id = loop { + let mut id = SCHEME_LIST_NEXT_ID.fetch_add(1, Ordering::Relaxed); + + if id >= SCHEME_MAX_SCHEMES { + id = 1; + SCHEME_LIST_NEXT_ID.store(id, Ordering::Relaxed); + } + + let id = SchemeId(id); + + if !handles.contains_key(&id) { + break id; + } + }; + + let root_id = SchemeId(SCHEME_LIST_ID.load(Ordering::Relaxed)); + let inner = Arc::new(UserInner::new(root_id, id, context)); + let new_scheme = Handle::Scheme(KernelSchemes::User(UserScheme::new(inner))); + assert!(handles.insert(id, new_scheme).is_none()); + Ok(id) + } + + /// Remove a scheme + fn remove(&self, id: usize, token: &mut CleanLockToken) { + let scheme = handles().write(token.token()).remove(&SchemeId(id)); + + assert!(scheme.is_some()); + if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme + && let Some(user) = Arc::into_inner(user.inner) + { + user.into_drop(token); + } + } +} + +impl KernelScheme for SchemeList { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let id = SchemeId(0); + handles() + .write(token.token()) + .insert(id, Handle::SchemeCreationCapability); + Ok(id.get()) + } + fn kdup( + &self, + scheme_id: usize, + user_buf: UserSliceRo, + caller: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + let scheme_id = SchemeId(scheme_id); + match handles() + .read(token.token()) + .get(&scheme_id) + .ok_or(Error::new(EBADF))? + { + Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => { + let inner = inner.clone(); + assert!(scheme_id == inner.scheme_id); + let scheme = scheme_id; + let params = unsafe { user_buf.read_exact::()? 
}; + + return Ok(OpenResult::External(Arc::new(RwLock::new( + FileDescription { + scheme, + number: params.number, + offset: params.offset, + flags: params.flags as u32, + internal_flags: InternalFlags::from_extra0(params.internal_flags) + .ok_or(Error::new(EINVAL))?, + }, + )))); + } + Handle::SchemeCreationCapability => (), + _ => return Err(Error::new(EBADF)), + }; + + const EXPECTED: &[u8] = b"create-scheme"; + let mut buf = [0u8; EXPECTED.len()]; + + if user_buf.copy_common_bytes_to_slice(&mut buf)? < EXPECTED.len() || buf != *EXPECTED { + return Err(Error::new(EINVAL)); + } + + if caller.uid != 0 { + return Err(Error::new(EACCES)); + }; + + let context = Arc::downgrade(&context::current()); + + let scheme_id = self.insert(context, token)?; + Ok(OpenResult::SchemeLocal( + scheme_id.get(), + InternalFlags::empty(), + )) + } + + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + buf.copy_common_bytes_from_slice("/scheme".as_bytes()) + } + + fn fevent( + &self, + id: usize, + flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + match self.get_user_inner(id, token) { + Some(inner) => inner.fevent(flags, token), + _ => Err(Error::new(EBADF)), + } + } + + fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + match self.get_user_inner(id, token) { + Some(inner) => inner.fsync(), + None => Err(Error::new(EBADF)), + } + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + self.remove(id, token); + Ok(()) + } + + fn kreadoff( + &self, + id: usize, + buf: UserSliceWo, + _offset: u64, + flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + match self.get_user_inner(id, token) { + Some(inner) => inner.read(buf, flags, token), + None => Err(Error::new(EBADF)), + } + } + + fn kwrite( + &self, + id: usize, + buf: UserSliceRo, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + match self.get_user_inner(id, token) { + 
Some(inner) => inner.write(buf, token), + None => Err(Error::new(EBADF)), + } + } + + fn kfdwrite( + &self, + id: usize, + descs: Vec>, + flags: CallFlags, + arg: u64, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + match self.get_user_inner(id, token) { + Some(inner) => inner.call_fdwrite(descs, flags, arg, metadata, token), + None => Err(Error::new(EBADF)), + } + } + + fn kfdread( + &self, + id: usize, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + match self.get_user_inner(id, token) { + Some(inner) => inner.call_fdread(payload, flags, metadata, token), + None => Err(Error::new(EBADF)), + } + } +} + +#[derive(Clone)] +pub enum KernelSchemes { + SchemeMgr, + User(UserScheme), + Global(GlobalSchemes), +} + +impl core::ops::Deref for KernelSchemes { + type Target = dyn KernelScheme; + + fn deref(&self) -> &Self::Target { + match self { + Self::SchemeMgr => &SchemeList, + Self::User(scheme) => scheme, + + Self::Global(global) => global.as_scheme(), + } + } +} + +pub const ALL_KERNEL_SCHEMES: &[GlobalSchemes] = &[ + GlobalSchemes::Debug, + GlobalSchemes::Event, + GlobalSchemes::Memory, + GlobalSchemes::Pipe, + GlobalSchemes::Serio, + GlobalSchemes::Irq, + GlobalSchemes::Time, + GlobalSchemes::Sys, + GlobalSchemes::Proc, + GlobalSchemes::Acpi, + GlobalSchemes::Dtb, +]; + +pub const MAX_GLOBAL_SCHEMES: usize = 16; +pub const KERNEL_SCHEMES_COUNT: usize = ALL_KERNEL_SCHEMES.len(); +const _: () = { + assert!(1 + KERNEL_SCHEMES_COUNT < MAX_GLOBAL_SCHEMES); +}; + +pub trait SchemeExt { + fn as_scheme(&self) -> &dyn KernelScheme; + fn scheme_id(self) -> SchemeId; +} +impl SchemeExt for GlobalSchemes { + fn as_scheme(&self) -> &dyn KernelScheme { + match self { + Self::Debug => &DebugScheme, + Self::Event => &EventScheme, + Self::Memory => &MemoryScheme, + Self::Pipe => &PipeScheme, + Self::Serio => &SerioScheme, + Self::Irq => &IrqScheme, + Self::Time => &TimeScheme, + Self::Sys => 
&SysScheme, + Self::Proc => &ProcScheme, + Self::Acpi => &AcpiScheme, + Self::Dtb => &DtbScheme, + } + } + fn scheme_id(self) -> SchemeId { + SchemeId::new(self as usize) + } +} + +#[cold] +pub fn init_globals() { + if cfg!(feature = "acpi") { + AcpiScheme::init(); + } + DtbScheme::init(); + IrqScheme::init(); +} + +#[allow(unused_variables)] +pub trait KernelScheme: Send + Sync + 'static { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + + fn kopenat( + &self, + file: usize, + path: StrOrBytes, + flags: usize, + fcntl_flags: u32, + _ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + + fn kfmap( + &self, + number: usize, + addr_space: &Arc, + map: &crate::syscall::data::Map, + consume: bool, + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + fn kfunmap( + &self, + number: usize, + offset: usize, + size: usize, + flags: MunmapFlags, + token: &mut CleanLockToken, + ) -> Result<()> { + Err(Error::new(EOPNOTSUPP)) + } + + fn kdup( + &self, + old_id: usize, + buf: UserSliceRo, + _caller: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + fn kwriteoff( + &self, + id: usize, + buf: UserSliceRo, + offset: u64, + flags: u32, + stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + if offset != u64::MAX { + return Err(Error::new(ESPIPE)); + } + self.kwrite(id, buf, flags, stored_flags, token) + } + fn kreadoff( + &self, + id: usize, + buf: UserSliceWo, + offset: u64, + flags: u32, + stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + if offset != u64::MAX { + return Err(Error::new(ESPIPE)); + } + self.kread(id, buf, flags, stored_flags, token) + } + fn kwrite( + &self, + id: usize, + buf: UserSliceRo, + flags: u32, + stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EBADF)) + } + fn kread( + &self, + id: usize, + buf: UserSliceWo, + flags: u32, + 
stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EBADF)) + } + fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result; + fn kfutimens(&self, id: usize, buf: UserSliceRo, token: &mut CleanLockToken) -> Result { + Err(Error::new(EBADF)) + } + fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + Err(Error::new(EBADF)) + } + fn kfstatvfs(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + Err(Error::new(EBADF)) + } + + fn getdents( + &self, + id: usize, + buf: UserSliceWo, + header_size: u16, + opaque_id_first: u64, + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + + fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + Ok(()) + } + fn ftruncate(&self, id: usize, len: usize, token: &mut CleanLockToken) -> Result<()> { + Err(Error::new(EBADF)) + } + fn fsize(&self, id: usize, token: &mut CleanLockToken) -> Result { + Err(Error::new(ESPIPE)) + } + fn fchmod(&self, id: usize, new_mode: u16, token: &mut CleanLockToken) -> Result<()> { + Err(Error::new(EBADF)) + } + fn fchown( + &self, + id: usize, + new_uid: u32, + new_gid: u32, + token: &mut CleanLockToken, + ) -> Result<()> { + Err(Error::new(EBADF)) + } + fn fevent( + &self, + id: usize, + flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + Ok(EventFlags::empty()) + } + fn flink( + &self, + id: usize, + new_path: &str, + caller_ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result<()> { + Err(Error::new(EBADF)) + } + fn frename( + &self, + id: usize, + new_path: &str, + caller_ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result<()> { + Err(Error::new(EBADF)) + } + fn fcntl( + &self, + id: usize, + cmd: usize, + arg: usize, + token: &mut CleanLockToken, + ) -> Result { + Ok(0) + } + fn unlinkat( + &self, + file: usize, + path: &str, + flags: usize, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result<()> 
{ + Err(Error::new(ENOENT)) + } + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + Ok(()) + } + fn detach(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + Ok(()) + } + fn kcall( + &self, + id: usize, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + fn kstdfscall( + &self, + id: usize, + kind: StdFsCallKind, + desc: Arc, + payload: UserSliceRw, + flags: CallFlags, + metadata: StdFsCallMeta, + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + fn kfdwrite( + &self, + id: usize, + descs: Vec>, + flags: CallFlags, + args: u64, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + fn kfdread( + &self, + id: usize, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + Err(Error::new(EOPNOTSUPP)) + } + + fn translate_std_fs_call( + &self, + id: usize, + desc: Arc, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let &[kind, arg1, arg2, ..] 
= metadata else {
+            // Fewer than three metadata words were supplied.
+            return Err(Error::new(EINVAL));
+        };
+        // The call kind must fit in a byte. A plain `as u8` cast would silently
+        // truncate, letting e.g. 0x101 alias a valid kind; out-of-range values are
+        // rejected the same way as unknown kinds.
+        let Some(kind) = u8::try_from(kind)
+            .ok()
+            .and_then(|raw| StdFsCallKind::try_from_raw(raw))
+        else {
+            return Err(Error::new(EOPNOTSUPP));
+        };
+        let metadata = StdFsCallMeta::new(kind, arg1, arg2);
+        use syscall::flag::StdFsCallKind::*;
+        // Dispatch to the matching trait method; calls that return `()` are mapped
+        // to a result of 0.
+        match kind {
+            Fchmod => self.fchmod(id, metadata.arg1 as u16, token).map(|_| 0),
+            Getdents => self.getdents(
+                id,
+                payload.into_wo()?,
+                metadata.arg2 as u16,
+                metadata.arg1,
+                token,
+            ),
+            Fstat => self.kfstat(id, payload.into_wo()?, token).map(|_| 0),
+            Fstatvfs => self.kfstatvfs(id, payload.into_wo()?, token).map(|_| 0),
+            Fsync => self.fsync(id, token).map(|_| 0),
+            Ftruncate => self.ftruncate(id, metadata.arg1 as usize, token).map(|_| 0),
+            Futimens => self.kfutimens(id, payload.into_ro()?, token),
+            /* TODO: Support Fchown and Unlinkat using std_fs_call
+            Fchown => self.kstdfscall(id, kind, desc, payload, flags, metadata, token),
+            Unlinkat => self.kstdfscall(fd, kind, payload, metadata, &caller).map(|_| 0)
+            */
+            _ => Err(Error::new(EOPNOTSUPP)),
+        }
+    }
+}
+
+/// Outcome of an open-like scheme call: either a number local to the scheme
+/// together with internal flags, or an already constructed external description.
+#[derive(Debug)]
+pub enum OpenResult {
+    SchemeLocal(usize, InternalFlags),
+    External(Arc),
+}
+/// Identity of the caller that is passed along with scheme calls.
+pub struct CallerCtx {
+    pub pid: usize,
+    pub uid: u32,
+    pub gid: u32,
+    pub groups: alloc::vec::Vec,
+}
+impl CallerCtx {
+    /// Returns the context with `uid`/`gid` replaced by `euid`/`egid` when both the
+    /// current `uid` and `gid` are 0; otherwise returns the context unchanged.
+    /// `pid` and `groups` are always carried over.
+    pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self {
+        if self.uid == 0 && self.gid == 0 {
+            Self {
+                pid: self.pid,
+                uid: euid,
+                gid: egid,
+                groups: self.groups,
+            }
+        } else {
+            self
+        }
+    }
+}
diff --git a/src/scheme/pipe.rs b/src/scheme/pipe.rs
new file mode 100644
index 0000000000..df5db9d908
--- /dev/null
+++ b/src/scheme/pipe.rs
@@ -0,0 +1,521 @@
+use alloc::{collections::VecDeque, sync::Arc, vec::Vec};
+use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+
+use syscall::{data::GlobalSchemes, CallFlags};
+
+use hashbrown::{hash_map::DefaultHashBuilder, HashMap};
+
+use crate::{
+    context::{
+        context::{bulk_add_fds, bulk_insert_fds},
+        file::{InternalFlags, LockedFileDescription},
+    },
+
event, + sync::{CleanLockToken, Mutex, RwLock, WaitCondition, L1}, + syscall::{ + data::Stat, + error::{Error, Result, EAGAIN, EBADF, EINTR, EINVAL, ENOENT, EPIPE}, + flag::{EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_NONBLOCK}, + usercopy::{UserSliceRo, UserSliceRw, UserSliceWo}, + }, +}; + +use super::{CallerCtx, KernelScheme, OpenResult, SchemeExt, StrOrBytes}; + +// TODO: Preallocate a number of scheme IDs, since there can only be *one* root namespace, and +// therefore only *one* pipe scheme. +static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(0); + +enum Handle { + Pipe(Arc), + SchemeRoot, +} + +// TODO: SLOB? +static PIPES: RwLock> = + RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); + +const MAX_QUEUE_SIZE: usize = 65536; + +// In almost all places where Rust (and LLVM) uses pointers, they are limited to nonnegative isize, +// so this is fine. +const WRITE_NOT_READ_BIT: usize = 1; + +fn from_raw_id(id: usize) -> (bool, usize) { + (id & WRITE_NOT_READ_BIT != 0, id & !WRITE_NOT_READ_BIT) +} + +pub fn pipe(token: &mut CleanLockToken) -> Result<(usize, usize)> { + // Bit 0 is used for WRITE_NOT_READ_BIT + let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed); + + PIPES.write(token.token()).insert( + id, + Handle::Pipe(Arc::new(Pipe { + queue: Mutex::new(VecDeque::new()), + read_condition: WaitCondition::new(), + write_condition: WaitCondition::new(), + writer_is_alive: AtomicBool::new(true), + reader_is_alive: AtomicBool::new(true), + has_run_dup: AtomicBool::new(false), + fd_queue: Mutex::new(VecDeque::new()), + })), + ); + + Ok((id, id | WRITE_NOT_READ_BIT)) +} + +pub struct PipeScheme; + +impl PipeScheme { + fn get_pipe(key: usize, token: &mut CleanLockToken) -> Result> { + PIPES + .read(token.token()) + .get(&key) + .and_then(|handle| match handle { + Handle::Pipe(pipe) => Some(Arc::clone(pipe)), + _ => None, + }) + .ok_or(Error::new(EBADF)) + } +} + +impl KernelScheme for PipeScheme { + fn scheme_root(&self, token: &mut CleanLockToken) 
-> Result {
+        let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
+        PIPES.write(token.token()).insert(id, Handle::SchemeRoot);
+        Ok(id)
+    }
+    /// Reports which of the requested events are ready on one end of a pipe.
+    ///
+    /// Bit 0 of `id` selects the end (set = writer). The writer end is ready for
+    /// `EVENT_WRITE` when the byte queue has room or the reader is gone; the reader
+    /// end is ready for `EVENT_READ` when the queue is non-empty or the writer is
+    /// gone. Returns `EBADF` if `id` does not name a pipe.
+    fn fevent(
+        &self,
+        id: usize,
+        flags: EventFlags,
+        token: &mut CleanLockToken,
+    ) -> Result {
+        let (is_writer_not_reader, key) = from_raw_id(id);
+        let pipe = Self::get_pipe(key, token)?;
+
+        let mut ready = EventFlags::empty();
+
+        // Strictly less than: `kwrite` can accept `MAX_QUEUE_SIZE - len` bytes, so
+        // a queue holding exactly `MAX_QUEUE_SIZE` bytes is not writable. Reporting
+        // it as ready would make a non-blocking writer spin on `EAGAIN`.
+        if is_writer_not_reader
+            && flags.contains(EVENT_WRITE)
+            && (pipe.queue.lock(token.token()).len() < MAX_QUEUE_SIZE
+                || !pipe.reader_is_alive.load(Ordering::Acquire))
+        {
+            ready |= EventFlags::EVENT_WRITE;
+        }
+        if !is_writer_not_reader
+            && flags.contains(EVENT_READ)
+            && (!pipe.queue.lock(token.token()).is_empty()
+                || !pipe.writer_is_alive.load(Ordering::Acquire))
+        {
+            ready |= EventFlags::EVENT_READ;
+        }
+
+        Ok(ready)
+    }
+
+    /// Closes one end of a pipe, or a scheme-root handle.
+    ///
+    /// Closing an end marks it dead, wakes the opposite end (event trigger plus
+    /// wait-condition notify) and, once both ends are dead, removes the pipe from
+    /// `PIPES`. Returns `EBADF` if `id` names neither a pipe nor a scheme root.
+    fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
+        let (is_write_not_read, key) = from_raw_id(id);
+
+        let pipe = match Self::get_pipe(key, token) {
+            Ok(pipe) => pipe,
+            Err(err) => {
+                // `get_pipe` only resolves `Handle::Pipe`. A handle inserted by
+                // `scheme_root` has to be released here, otherwise it could never
+                // be removed from `PIPES`.
+                let mut pipes = PIPES.write(token.token());
+                if matches!(pipes.get(&key), Some(Handle::SchemeRoot)) {
+                    pipes.remove(&key);
+                    return Ok(());
+                }
+                return Err(err);
+            }
+        };
+        let scheme_id = GlobalSchemes::Pipe.scheme_id();
+
+        let can_remove = if is_write_not_read {
+            pipe.writer_is_alive.store(false, Ordering::SeqCst);
+            event::trigger(scheme_id, key, EVENT_READ, token);
+            pipe.read_condition.notify(token);
+
+            !pipe.reader_is_alive.load(Ordering::SeqCst)
+        } else {
+            pipe.reader_is_alive.store(false, Ordering::SeqCst);
+            event::trigger(scheme_id, key | WRITE_NOT_READ_BIT, EVENT_WRITE, token);
+            pipe.write_condition.notify(token);
+
+            !pipe.writer_is_alive.load(Ordering::SeqCst)
+        };
+
+        if can_remove {
+            let handle = PIPES.write(token.token()).remove(&key);
+            // `Arc::into_inner` yields the pipe only if this was the last strong
+            // reference; the local `pipe` above is still alive at this point.
+            if let Some(Handle::Pipe(pipe)) = handle
+                && let Some(pipe) = Arc::into_inner(pipe)
+            {
+                {
+                    pipe.read_condition.into_drop(token);
+                }
+                {
+                    pipe.write_condition.into_drop(token);
+                }
+            }
+        }
+
+        // Give up the local reference; tear the wait conditions down if it was
+        // the last one.
+        if let Some(pipe) = Arc::into_inner(pipe) {
+            {
+                pipe.read_condition.into_drop(token);
+            }
+            {
+                pipe.write_condition.into_drop(token);
+            }
+        }
+
+        Ok(())
+    }
+
+    fn kdup(
+        &self,
+        old_id: usize,
+        user_buf:
UserSliceRo,
+        _ctx: CallerCtx,
+        token: &mut CleanLockToken,
+    ) -> Result {
+        let (is_writer_not_reader, key) = from_raw_id(old_id);
+
+        if is_writer_not_reader {
+            return Err(Error::new(EBADF));
+        }
+
+        let mut buf = [0_u8; 5];
+
+        if user_buf.copy_common_bytes_to_slice(&mut buf)? < 5 || buf != *b"write" {
+            return Err(Error::new(EINVAL));
+        }
+
+        let pipe = Self::get_pipe(key, token)?;
+
+        if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
+            return Err(Error::new(EBADF));
+        }
+
+        Ok(OpenResult::SchemeLocal(
+            key | WRITE_NOT_READ_BIT,
+            InternalFlags::empty(),
+        ))
+    }
+    /// Opens a path relative to an existing pipe-scheme handle.
+    ///
+    /// * On a scheme-root handle, an empty path (optionally only `/` characters)
+    ///   creates a new pipe and returns its read end; any other path is `ENOENT`,
+    ///   and a path that is not valid UTF-8 is `EINVAL`.
+    /// * On a pipe handle, the path `write` returns the write end. This can succeed
+    ///   only once per pipe (`has_run_dup`); later attempts get `EBADF`. Any other
+    ///   path is `EINVAL`.
+    /// * Any other id is `EBADF`.
+    fn kopenat(
+        &self,
+        id: usize,
+        user_buf: StrOrBytes,
+        _flags: usize,
+        _fcntl_flags: u32,
+        _ctx: CallerCtx,
+        token: &mut CleanLockToken,
+    ) -> Result {
+        // NOTE(review): unlike `kdup`, the reader/writer bit of `id` is ignored
+        // here, so the write end is accepted as the base handle too - confirm
+        // whether that is intended.
+        let (_, key) = from_raw_id(id);
+
+        {
+            let guard = PIPES.read(token.token());
+            if let Some(Handle::SchemeRoot) = guard.get(&key) {
+                // Fall through to the pipe-creation path below.
+            } else if let Some(Handle::Pipe(pipe_arc)) = guard.get(&key) {
+                let pipe = Arc::clone(pipe_arc);
+                drop(guard);
+
+                // Same contract as `kdup`: only the name "write" selects the
+                // write end. The comparison used to be `==`, which rejected the
+                // one valid name and accepted every other path.
+                if user_buf.as_bytes() != b"write" {
+                    return Err(Error::new(EINVAL));
+                }
+
+                if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
+                    return Err(Error::new(EBADF));
+                }
+
+                return Ok(OpenResult::SchemeLocal(
+                    key | WRITE_NOT_READ_BIT,
+                    InternalFlags::empty(),
+                ));
+            } else {
+                return Err(Error::new(EBADF));
+            }
+        }
+
+        let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
+        if !path.trim_start_matches('/').is_empty() {
+            return Err(Error::new(ENOENT));
+        }
+
+        let (read_id, _) = pipe(token)?;
+
+        Ok(OpenResult::SchemeLocal(read_id, InternalFlags::empty()))
+    }
+
+    fn kread(
+        &self,
+        id: usize,
+        user_buf: UserSliceWo,
+        fcntl_flags: u32,
+        _stored_flags: u32,
+        token: &mut CleanLockToken,
+    ) -> Result {
+        let (is_write_not_read, key) = from_raw_id(id);
+
+        if is_write_not_read {
+            return Err(Error::new(EBADF));
+        }
+        let pipe = Self::get_pipe(key, token)?;
+
+        loop {
+            let vec = pipe.queue.lock(token.token());
+            let (mut vec, mut token) = vec.into_split();
+
+            let (s1, s2) =
vec.as_slices(); + let s1_count = core::cmp::min(user_buf.len(), s1.len()); + + let (s1_dst, s2_buf) = user_buf + .split_at(s1_count) + .expect("s1_count <= user_buf.len()"); + s1_dst.copy_from_slice(&s1[..s1_count])?; + + let s2_count = core::cmp::min(s2_buf.len(), s2.len()); + s2_buf + .limit(s2_count) + .expect("s2_count <= s2_buf.len()") + .copy_from_slice(&s2[..s2_count])?; + + let bytes_read = s1_count + s2_count; + let _ = vec.drain(..bytes_read); + + if bytes_read > 0 { + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key | WRITE_NOT_READ_BIT, + EVENT_WRITE, + token.token(), + ); + pipe.write_condition.notify_locked(token.token()); + + return Ok(bytes_read); + } else if user_buf.is_empty() { + return Ok(0); + } + + if !pipe.writer_is_alive.load(Ordering::SeqCst) { + return Ok(0); + } else if fcntl_flags & O_NONBLOCK as u32 != 0 { + return Err(Error::new(EAGAIN)); + } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) { + return Err(Error::new(EINTR)); + } + } + } + fn kwrite( + &self, + id: usize, + user_buf: UserSliceRo, + fcntl_flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let (is_write_not_read, key) = from_raw_id(id); + + if !is_write_not_read { + return Err(Error::new(EBADF)); + } + let pipe = Self::get_pipe(key, token)?; + + loop { + let vec = pipe.queue.lock(token.token()); + let (mut vec, mut token) = vec.into_split(); + + if !pipe.reader_is_alive.load(Ordering::Relaxed) { + return Err(Error::new(EPIPE)); + } + + let bytes_left = MAX_QUEUE_SIZE.saturating_sub(vec.len()); + let bytes_to_write = core::cmp::min(bytes_left, user_buf.len()); + let src_buf = user_buf + .limit(bytes_to_write) + .expect("bytes_to_write <= user_buf.len()"); + + const TMPBUF_SIZE: usize = 512; + let mut tmp_buf = [0_u8; TMPBUF_SIZE]; + + let mut bytes_written = 0; + + // TODO: Modify VecDeque so that the unwritten portions can be accessed directly? 
+ for (idx, chunk) in src_buf.in_variable_chunks(TMPBUF_SIZE).enumerate() { + let chunk_byte_count = match chunk.copy_common_bytes_to_slice(&mut tmp_buf) { + Ok(c) => c, + Err(_) if idx > 0 => break, + Err(error) => return Err(error), + }; + vec.extend(&tmp_buf[..chunk_byte_count]); + bytes_written += chunk_byte_count; + } + + if bytes_written > 0 { + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key, + EVENT_READ, + token.token(), + ); + pipe.read_condition.notify_locked(token.token()); + + return Ok(bytes_written); + } else if user_buf.is_empty() { + return Ok(0); + } + + if fcntl_flags & O_NONBLOCK as u32 != 0 { + return Err(Error::new(EAGAIN)); + } else if !pipe + .write_condition + .wait(vec, "PipeWrite::write", &mut token) + { + return Err(Error::new(EINTR)); + } + } + } + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + //TODO: construct useful path? + buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes()) + } + fn kfstat(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> { + buf.copy_exactly(&Stat { + st_mode: MODE_FIFO | 0o666, + ..Default::default() + })?; + + Ok(()) + } + fn kfdwrite( + &self, + id: usize, + mut descs: Vec>, + _flags: CallFlags, + _args: u64, + _metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let (is_write_not_read, key) = from_raw_id(id); + + if !is_write_not_read { + return Err(Error::new(EBADF)); + } + let pipe = match Self::get_pipe(key, token) { + Ok(p) => p, + Err(e) => { + return Err(e); + } + }; + + loop { + let vec = pipe.fd_queue.lock(token.token()); + let (mut vec, mut token) = vec.into_split(); + + if !pipe.reader_is_alive.load(Ordering::Relaxed) { + return Err(Error::new(EPIPE)); + } + if descs.is_empty() { + return Ok(0); + } + + let before_len = vec.len(); + + for desc in descs.drain(..) 
{ + if vec.len() < crate::context::CONTEXT_MAX_FILES { + vec.push_back(desc); + } else { + break; + } + } + + let fds_written = vec.len() - before_len; + + if fds_written > 0 { + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key, + EVENT_READ, + token.token(), + ); + pipe.read_condition.notify_locked(token.token()); + + return Ok(fds_written); + } + + if !pipe + .write_condition + .wait(vec, "PipeWrite::write", &mut token) + { + return Err(Error::new(EINTR)); + } + } + } + fn kfdread( + &self, + id: usize, + payload: UserSliceRw, + flags: CallFlags, + _metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let (is_write_not_read, key) = from_raw_id(id); + + if is_write_not_read { + return Err(Error::new(EBADF)); + } + let pipe = match Self::get_pipe(key, token) { + Ok(p) => p, + Err(e) => { + return Err(e); + } + }; + + if payload.is_empty() { + return Ok(0); + } + + loop { + let vec = pipe.fd_queue.lock(token.token()); + let (mut vec, mut token) = vec.into_split(); + + let fds_available = vec.len(); + let max_fds_read = payload.len() / size_of::(); + let fds_to_read = core::cmp::min(fds_available, max_fds_read); + if fds_to_read > 0 { + let fds_to_transfer: Vec<_> = vec.drain(..fds_to_read).collect(); + + if flags.contains(CallFlags::FD_UPPER) { + bulk_insert_fds( + fds_to_transfer, + payload, + flags.contains(CallFlags::FD_CLOEXEC), + &mut token, + )?; + } else { + bulk_add_fds( + fds_to_transfer, + payload, + flags.contains(CallFlags::FD_CLOEXEC), + &mut token, + )?; + } + + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key | WRITE_NOT_READ_BIT, + EVENT_WRITE, + token.token(), + ); + pipe.write_condition.notify_locked(token.token()); + + return Ok(fds_to_read); + } + + if !pipe.writer_is_alive.load(Ordering::SeqCst) { + return Ok(0); + } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) { + return Err(Error::new(EINTR)); + } + } + } +} + +pub struct Pipe { + read_condition: WaitCondition, // signals 
whether there are available bytes to read + write_condition: WaitCondition, // signals whether there is room for additional bytes + queue: Mutex>, + reader_is_alive: AtomicBool, // starts set, unset when reader closes + writer_is_alive: AtomicBool, // starts set, unset when writer closes + has_run_dup: AtomicBool, + fd_queue: Mutex>>, +} diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs new file mode 100644 index 0000000000..6ffb256cad --- /dev/null +++ b/src/scheme/proc.rs @@ -0,0 +1,1570 @@ +use crate::{ + context::{ + self, + context::{HardBlockedReason, LockedFdTbl, SignalState}, + file::InternalFlags, + memory::{handle_notify_files, AddrSpace, AddrSpaceWrapper, Grant, PageSpan}, + Context, ContextLock, Status, + }, + memory::{Page, VirtualAddress, PAGE_SIZE}, + ptrace, + scheme::{self, memory::MemoryScheme, FileHandle, KernelScheme}, + sync::{CleanLockToken, LockToken, RwLock, L1, L4}, + syscall::{ + data::{GrantDesc, Map, SetSighandlerData, Stat}, + error::*, + flag::*, + usercopy::{UserSliceRo, UserSliceRw, UserSliceWo}, + EnvRegisters, FloatRegisters, IntRegisters, + }, +}; + +use super::{CallerCtx, KernelSchemes, OpenResult}; +use ::syscall::{ProcSchemeAttrs, SigProcControl, Sigcontrol}; +use alloc::{ + boxed::Box, + string::String, + sync::{Arc, Weak}, + vec::Vec, +}; +use core::{ + mem::size_of, + num::NonZeroUsize, + slice, str, + sync::atomic::{AtomicUsize, Ordering}, +}; +use hashbrown::{ + hash_map::{DefaultHashBuilder, Entry}, + HashMap, +}; +use syscall::data::GlobalSchemes; + +fn read_from(dst: UserSliceWo, src: &[u8], offset: u64) -> Result { + let avail_src = usize::try_from(offset) + .ok() + .and_then(|o| src.get(o..)) + .unwrap_or(&[]); + dst.copy_common_bytes_from_slice(avail_src) +} + +fn try_stop_context( + context_ref: Arc, + token: &mut CleanLockToken, + callback: impl FnOnce(&mut Context, LockToken<'_, L4>) -> Result, +) -> Result { + if context::is_current(&context_ref) { + let context = &mut context_ref.write(token.token()); + let 
(context, token) = context.token_split(); + return callback(context, token); + } + // Stop process + let (prev_status, mut running) = { + let mut context = context_ref.write(token.token()); + + ( + core::mem::replace( + &mut context.status, + context::Status::HardBlocked { + reason: HardBlockedReason::NotYetStarted, + }, + ), + context.running, + ) + }; + + // Wait until stopped + while running { + context::switch(token); + + running = context_ref.read(token.token()).running; + } + + let mut context = context_ref.write(token.token()); + assert!( + !context.running, + "process can't have been restarted, we stopped it!" + ); + + let (context, token) = context.token_split(); + let ret = callback(context, token); + + context.status = prev_status; + + ret +} + +#[derive(Clone, Copy, PartialEq, Eq)] +enum RegsKind { + Float, + Int, + Env, +} +#[derive(Clone)] +enum ContextHandle { + // Opened by the process manager, after which it is locked. This capability is used to open + // Attr handles, to set ens/euid/egid/pid. + Authority, + Attr, + Groups, + + Status { + privileged: bool, + }, // can write ContextVerb + + Regs(RegsKind), + Sighandler, + Start, + NewFiletable { + filetable: Arc, + binary_format: bool, + data: Box<[u8]>, + }, + Filetable { + filetable: Weak, + binary_format: bool, + data: Box<[u8]>, + }, + AddrSpace { + addrspace: Arc, + }, + CurrentAddrSpace, + + AwaitingAddrSpaceChange { + new: Arc, + new_sp: usize, + new_ip: usize, + arg1: Option, + }, + + CurrentFiletable, + + AwaitingFiletableChange { + new_ft: Arc, + }, + + // TODO: Remove this once openat is implemented, or allow openat-via-dup via e.g. the top-level + // directory. 
+ OpenViaDup, + SchedAffinity, + + MmapMinAddr(Arc), +} +#[derive(Clone)] +struct Handle { + context: Arc, + kind: ContextHandle, +} +pub struct ProcScheme; + +static NEXT_ID: AtomicUsize = AtomicUsize::new(1); +static HANDLES: RwLock> = + RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); + +#[cfg(feature = "debugger")] +#[allow(dead_code)] +pub fn foreach_addrsp( + token: &mut CleanLockToken, + mut f: impl FnMut(&Arc, LockToken), +) { + let mut handles_guard = HANDLES.read(token.token()); + let (handles, mut token) = handles_guard.token_split(); + for (_, handle) in handles.iter() { + let Handle { + kind: + ContextHandle::AddrSpace { addrspace, .. } + | ContextHandle::AwaitingAddrSpaceChange { new: addrspace, .. } + | ContextHandle::MmapMinAddr(addrspace), + .. + } = handle + else { + continue; + }; + f(&addrspace, token.token()); + } +} + +fn new_handle( + (handle, fl): (Handle, InternalFlags), + token: &mut CleanLockToken, +) -> Result<(usize, InternalFlags)> { + let id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + let _ = HANDLES.write(token.token()).insert(id, handle); + Ok((id, fl)) +} + +enum OpenTy { + Ctxt(Arc), + Auth, +} + +impl ProcScheme { + fn openat_context( + &self, + path: &str, + context: Arc, + token: &mut CleanLockToken, + ) -> Result> { + Ok(Some(match path { + "addrspace" => ( + ContextHandle::AddrSpace { + addrspace: Arc::clone( + context + .read(token.token()) + .addr_space() + .map_err(|_| Error::new(ENOENT))?, + ), + }, + true, + ), + "filetable" => ( + ContextHandle::Filetable { + filetable: Arc::downgrade(&context.read(token.token()).files), + binary_format: false, + data: Box::new([]), + }, + true, + ), + "filetable-binary" => ( + ContextHandle::Filetable { + filetable: Arc::downgrade(&context.read(token.token()).files), + binary_format: true, + data: Box::new([]), + }, + true, + ), + "current-addrspace" => (ContextHandle::CurrentAddrSpace, false), + "current-filetable" => (ContextHandle::CurrentFiletable, false), + 
"regs/float" => (ContextHandle::Regs(RegsKind::Float), false), + "regs/int" => (ContextHandle::Regs(RegsKind::Int), false), + "regs/env" => (ContextHandle::Regs(RegsKind::Env), false), + "sighandler" => (ContextHandle::Sighandler, false), + "start" => (ContextHandle::Start, false), + "open_via_dup" => (ContextHandle::OpenViaDup, false), + "mmap-min-addr" => ( + ContextHandle::MmapMinAddr(Arc::clone( + context + .read(token.token()) + .addr_space() + .map_err(|_| Error::new(ENOENT))?, + )), + false, + ), + "sched-affinity" => (ContextHandle::SchedAffinity, true), + "status" => (ContextHandle::Status { privileged: false }, false), + _ if path.starts_with("auth-") => { + let nonprefix = &path["auth-".len()..]; + let next_dash = nonprefix.find('-').ok_or(Error::new(ENOENT))?; + let auth_fd = nonprefix[..next_dash] + .parse::() + .map_err(|_| Error::new(ENOENT))?; + let actual_name = &nonprefix[next_dash + 1..]; + + let handle = match actual_name { + "attrs" => ContextHandle::Attr, + "status" => ContextHandle::Status { privileged: true }, + "groups" => ContextHandle::Groups, + _ => return Err(Error::new(ENOENT)), + }; + + let (hopefully_this_scheme, number) = extract_scheme_number(auth_fd, token)?; + verify_scheme(hopefully_this_scheme)?; + if !matches!( + HANDLES + .read(token.token()) + .get(&number) + .ok_or(Error::new(ENOENT))? + .kind, + ContextHandle::Authority + ) { + return Err(Error::new(ENOENT)); + } + + (handle, false) + } + _ => return Ok(None), + })) + } + fn open_inner( + &self, + ty: OpenTy, + operation_str: Option<&str>, + _flags: usize, + token: &mut CleanLockToken, + ) -> Result<(usize, InternalFlags)> { + let operation_name = operation_str.ok_or(Error::new(EINVAL))?; + let (mut handle, positioned) = match ty { + OpenTy::Ctxt(context) => { + match self.openat_context(operation_name, Arc::clone(&context), token)? 
{ + Some((kind, positioned)) => (Handle { context, kind }, positioned), + _ => { + return Err(Error::new(EINVAL)); + } + } + } + OpenTy::Auth => { + extern "C" fn ret() {} + let context = match operation_str.ok_or(Error::new(ENOENT))? { + "new-context" => { + let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed)) + .ok_or(Error::new(EMFILE))?; + let context = context::spawn(true, Some(id), ret, token)?; + { + let parent_groups = + context::current().read(token.token()).groups.clone(); + context.write(token.token()).groups = parent_groups; + } + HANDLES.write(token.token()).insert( + id.get(), + Handle { + context, + kind: ContextHandle::OpenViaDup, + }, + ); + return Ok((id.get(), InternalFlags::empty())); + } + "cur-context" => context::current(), + _ => return Err(Error::new(ENOENT)), + }; + + ( + Handle { + context, + kind: ContextHandle::OpenViaDup, + }, + false, + ) + } + }; + + { + let filetable_opt = match handle { + Handle { + kind: + ContextHandle::Filetable { + ref filetable, + binary_format, + ref mut data, + }, + .. + } => Some(( + filetable.upgrade().ok_or(Error::new(EOWNERDEAD))?, + binary_format, + data, + )), + Handle { + kind: + ContextHandle::NewFiletable { + ref filetable, + binary_format, + ref mut data, + }, + .. 
+ } => Some((Arc::clone(filetable), binary_format, data)), + _ => None, + }; + if let Some((filetable, binary_format, data)) = filetable_opt { + *data = if binary_format { + let mut data = Vec::new(); + for index in filetable + .read(token.token()) + .enumerate() + .filter_map(|(idx, val)| val.as_ref().map(|_| idx)) + { + data.extend((index as u64).to_le_bytes()); + } + data.into_boxed_slice() + } else { + use core::fmt::Write; + + let mut data = String::new(); + for index in filetable + .read(token.token()) + .enumerate() + .filter_map(|(idx, val)| val.as_ref().map(|_| idx)) + { + writeln!(data, "{}", index).unwrap(); + } + data.into_bytes().into_boxed_slice() + }; + } + }; + + let (id, int_fl) = new_handle( + ( + handle.clone(), + if positioned { + InternalFlags::POSITIONED + } else { + InternalFlags::empty() + }, + ), + token, + )?; + + Ok((id, int_fl)) + } +} + +impl KernelScheme for ProcScheme { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + HANDLES.write(token.token()).insert( + id, + Handle { + // TODO: placeholder + context: context::current(), + kind: ContextHandle::Authority, + }, + ); + Ok(id) + } + + fn fevent( + &self, + id: usize, + _flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + let handles = HANDLES.read(token.token()); + let _handle = handles.get(&id).ok_or(Error::new(EBADF))?; + + Ok(EventFlags::empty()) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + let handle = HANDLES + .write(token.token()) + .remove(&id) + .ok_or(Error::new(EBADF))?; + + match handle { + Handle { + context, + kind: + ContextHandle::AwaitingAddrSpaceChange { + new, + new_sp, + new_ip, + arg1, + }, + } => { + let old_ctx = try_stop_context(context, token, |context, _| { + let regs = context.regs_mut().ok_or(Error::new(EBADFD))?; + regs.set_instr_pointer(new_ip); + regs.set_stack_pointer(new_sp); + #[cfg(any( + target_arch = "x86_64", + target_arch = 
"aarch64", + target_arch = "riscv64" + ))] + regs.set_arg1(arg1); + + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + Ok(context.set_addr_space(Some(new), token.downgrade())) + })?; + if let Some(old_ctx) = old_ctx + && let Some(addrspace) = Arc::into_inner(old_ctx) + { + addrspace.into_drop(token); + } + let _ = ptrace::send_event( + crate::syscall::ptrace_event!(PTRACE_EVENT_ADDRSPACE_SWITCH, 0), + token, + ); + } + Handle { + kind: ContextHandle::AddrSpace { addrspace } | ContextHandle::MmapMinAddr(addrspace), + .. + } => { + if let Some(addrspace) = Arc::into_inner(addrspace) { + addrspace.into_drop(token); + } + } + + Handle { + kind: ContextHandle::AwaitingFiletableChange { new_ft }, + context, + } => { + context.write(token.token()).files = new_ft; + } + _ => (), + } + Ok(()) + } + fn kfmap( + &self, + id: usize, + dst_addr_space: &Arc, + map: &crate::syscall::data::Map, + consume: bool, + token: &mut CleanLockToken, + ) -> Result { + let handle = HANDLES + .read(token.token()) + .get(&id) + .ok_or(Error::new(EBADF))? + .clone(); + let Handle { kind, ref context } = handle; + + match kind { + ContextHandle::AddrSpace { ref addrspace } => { + if Arc::ptr_eq(addrspace, dst_addr_space) { + return Err(Error::new(EBUSY)); + } + + let PageSpan { + base: requested_dst_page, + .. 
+ } = crate::syscall::validate_region(map.address, map.size)?; + let src_span = + PageSpan::validate_nonempty(VirtualAddress::new(map.offset), map.size) + .ok_or(Error::new(EINVAL))?; + + let fixed = map.flags.contains(MapFlags::MAP_FIXED) + || map.flags.contains(MapFlags::MAP_FIXED_NOREPLACE); + let requested_dst_base = (map.address != 0 || fixed).then_some(requested_dst_page); + + let mut src_addr_space_guard = addrspace.acquire_write(token.downgrade()); + let (src_addr_space, lock_token) = src_addr_space_guard.token_split(); + + let src_page_count = NonZeroUsize::new(src_span.count).ok_or(Error::new(EINVAL))?; + + let mut notify_files = Vec::new(); + + // TODO: Validate flags + let result_base = if consume { + dst_addr_space.r#move( + Some((addrspace, &mut *src_addr_space)), + src_span, + requested_dst_base, + src_page_count.get(), + map.flags, + Some(&mut notify_files), + lock_token, + )? + } else { + // SAFETY: We've compared Arc::ptr_eq(addrspace, dst_addr_space) before + let mut dst_addrsp_guard = + unsafe { dst_addr_space.acquire_rewrite(lock_token) }; + dst_addrsp_guard.mmap( + dst_addr_space, + requested_dst_base, + src_page_count, + map.flags, + Some(&mut notify_files), + |dst_page, _, dst_mapper, flusher| { + Grant::borrow( + Arc::clone(addrspace), + src_addr_space, + src_span.base, + dst_page, + src_span.count, + map.flags, + dst_mapper, + flusher, + true, + true, + false, + ) + }, + )? + }; + + drop(src_addr_space_guard); + + handle_notify_files(notify_files, token); + + Ok(result_base.start_address().data()) + } + ContextHandle::Sighandler => { + let context = context.read(token.token()); + // let (context, token) = context.token_split(); + let sig = context.sig.as_ref().ok_or(Error::new(EBADF))?; + let frame = match map.offset { + // tctl + 0 => &sig.thread_control, + // pctl + PAGE_SIZE => &sig.proc_control, + _ => return Err(Error::new(EINVAL)), + }; + // TODO: Allocated or AllocatedShared? 
+ let addrsp = AddrSpace::current()?; + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere( + &addrsp, + NonZeroUsize::new(1).unwrap(), + MapFlags::PROT_READ | MapFlags::PROT_WRITE, + |page, flags, mapper, flusher| { + Grant::allocated_shared_one_page( + frame.get(), + page, + flags, + mapper, + flusher, + false, + ) + }, + )?; + Ok(page.start_address().data()) + } + _ => Err(Error::new(EBADF)), + } + } + fn kreadoff( + &self, + id: usize, + buf: UserSliceWo, + offset: u64, + _read_flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + // Don't hold a global lock during the context switch later on + let handle = { + let handles = HANDLES.read(token.token()); + handles.get(&id).ok_or(Error::new(EBADF))?.clone() + }; + + let Handle { context, kind } = handle; + kind.kreadoff(id, context, buf, offset, token) + } + fn kcall( + &self, + id: usize, + _payload: UserSliceRw, + _flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + // TODO: simplify + let handle = { + let mut handles = HANDLES.write(token.token()); + let handle = handles.get_mut(&id).ok_or(Error::new(EBADF))?; + handle.clone() + }; + + let ContextHandle::OpenViaDup = handle.kind else { + return Err(Error::new(EBADF)); + }; + + let verb: u8 = (*metadata.first().ok_or(Error::new(EINVAL))?) 
+ .try_into() + .map_err(|_| Error::new(EINVAL))?; + let verb = ProcSchemeVerb::try_from_raw(verb).ok_or(Error::new(EINVAL))?; + + match verb { + ProcSchemeVerb::Iopl => context::current() + .write(token.token()) + .set_userspace_io_allowed(true), + } + Ok(0) + } + fn kwriteoff( + &self, + id: usize, + buf: UserSliceRo, + _offset: u64, + _fcntl_flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + // TODO: offset + + // Don't hold a global lock during the context switch later on + let handle = { + let mut handles = HANDLES.write(token.token()); + let handle = handles.get_mut(&id).ok_or(Error::new(EBADF))?; + handle.clone() + }; + + let Handle { context, kind } = handle; + kind.kwriteoff(id, context, buf, token) + } + + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + //TODO: construct useful path? + buf.copy_common_bytes_from_slice("/scheme/kernel.proc/".as_bytes()) + } + + fn kfstat(&self, id: usize, buffer: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let handles = HANDLES.read(token.token()); + let handle = handles.get(&id).ok_or(Error::new(EBADF))?; + + buffer.copy_exactly(&Stat { + st_mode: MODE_FILE | 0o666, + st_size: handle.fsize()?, + + ..Stat::default() + })?; + + Ok(()) + } + + fn fsize(&self, id: usize, token: &mut CleanLockToken) -> Result { + let mut handles = HANDLES.write(token.token()); + let handle = handles.get_mut(&id).ok_or(Error::new(EBADF))?; + + handle.fsize() + } + + /// Dup is currently used to implement clone() and execve(). 
+ fn kdup( + &self, + old_id: usize, + raw_buf: UserSliceRo, + _: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + let info = { + let handles = HANDLES.read(token.token()); + let handle = handles.get(&old_id).ok_or(Error::new(EBADF))?; + + handle.clone() + }; + + let handle = |h, positioned| { + ( + h, + if positioned { + InternalFlags::POSITIONED + } else { + InternalFlags::empty() + }, + ) + }; + let mut array = [0_u8; 64]; + if raw_buf.len() > array.len() { + return Err(Error::new(EINVAL)); + } + raw_buf.copy_to_slice(&mut array[..raw_buf.len()])?; + let buf = &array[..raw_buf.len()]; + + new_handle( + match info { + Handle { + kind: ContextHandle::Authority, + .. + } => { + return self + .open_inner( + OpenTy::Auth, + Some(core::str::from_utf8(buf).map_err(|_| Error::new(EINVAL))?) + .filter(|s| !s.is_empty()), + O_RDWR | O_CLOEXEC, + token, + ) + .map(|(r, fl)| OpenResult::SchemeLocal(r, fl)) + } + Handle { + kind: ContextHandle::OpenViaDup, + context, + } => { + return self + .open_inner( + OpenTy::Ctxt(context), + Some(core::str::from_utf8(buf).map_err(|_| Error::new(EINVAL))?) + .filter(|s| !s.is_empty()), + O_RDWR | O_CLOEXEC, + token, + ) + .map(|(r, fl)| OpenResult::SchemeLocal(r, fl)); + } + + Handle { + kind: + ContextHandle::Filetable { + ref filetable, + binary_format, + ref data, + }, + context, + } => { + // TODO: Maybe allow userspace to either copy or transfer recently dupped file + // descriptors between file tables. 
+ if buf != b"copy" { + return Err(Error::new(EINVAL)); + } + let filetable = filetable.upgrade().ok_or(Error::new(EOWNERDEAD))?; + + let new_filetable = + Arc::new(RwLock::new(filetable.read(token.token()).clone())); + + handle( + Handle { + kind: ContextHandle::NewFiletable { + filetable: new_filetable, + binary_format, + data: data.clone(), + }, + context, + }, + true, + ) + } + Handle { + kind: ContextHandle::AddrSpace { ref addrspace }, + context, + } => { + const GRANT_FD_PREFIX: &[u8] = b"grant-fd-"; + + let kind = match buf { + // TODO: Better way to obtain new empty address spaces, perhaps using SYS_OPEN. But + // in that case, what scheme? + b"empty" => ContextHandle::AddrSpace { + addrspace: AddrSpaceWrapper::new()?, + }, + b"exclusive" => ContextHandle::AddrSpace { + addrspace: addrspace.try_clone(token)?, + }, + b"mmap-min-addr" => ContextHandle::MmapMinAddr(Arc::clone(addrspace)), + + _ if buf.starts_with(GRANT_FD_PREFIX) => { + let string = core::str::from_utf8(&buf[GRANT_FD_PREFIX.len()..]) + .map_err(|_| Error::new(EINVAL))?; + let page_addr = usize::from_str_radix(string, 16) + .map_err(|_| Error::new(EINVAL))?; + + if page_addr % PAGE_SIZE != 0 { + return Err(Error::new(EINVAL)); + } + + let page = Page::containing_address(VirtualAddress::new(page_addr)); + + let mut token = token.token(); + let read_lock = addrspace.acquire_read(token.downgrade()); + let (_, info) = + read_lock.grants.contains(page).ok_or(Error::new(EINVAL))?; + return Ok(OpenResult::External( + info.file_ref() + .map(|r| Arc::clone(&r.description)) + .ok_or(Error::new(EBADF))?, + )); + } + + _ => return Err(Error::new(EINVAL)), + }; + + handle(Handle { context, kind }, true) + } + _ => return Err(Error::new(EINVAL)), + }, + token, + ) + .map(|(r, fl)| OpenResult::SchemeLocal(r, fl)) + } +} +fn extract_scheme_number(fd: usize, token: &mut CleanLockToken) -> Result<(KernelSchemes, usize)> { + let (scheme_id, number) = { + let current_lock = context::current(); + let mut current = 
current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let file_descriptor = context + .get_file(FileHandle::from(fd), &mut token) + .ok_or(Error::new(EBADF))?; + let desc = file_descriptor.description.read(token.token()); + (desc.scheme, desc.number) + }; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + Ok((scheme, number)) +} +fn verify_scheme(scheme: KernelSchemes) -> Result<()> { + if !matches!(scheme, KernelSchemes::Global(GlobalSchemes::Proc)) { + return Err(Error::new(EBADF)); + } + Ok(()) +} +impl Handle { + fn fsize(&self) -> Result { + match self.kind { + ContextHandle::Filetable { ref data, .. } + | ContextHandle::NewFiletable { ref data, .. } => Ok(data.len() as u64), + _ => Ok(0), + } + } +} +impl ContextHandle { + fn kwriteoff( + self, + id: usize, + context: Arc, + buf: UserSliceRo, + token: &mut CleanLockToken, + ) -> Result { + match self { + Self::AddrSpace { addrspace } => { + let mut chunks = buf.usizes(); + let mut words_read = 0; + let mut next = || { + words_read += 1; + chunks.next().ok_or(Error::new(EINVAL)) + }; + + match next()?? { + op @ ADDRSPACE_OP_MMAP | op @ ADDRSPACE_OP_TRANSFER => { + let fd = next()??; + let offset = next()??; + let page_span = crate::syscall::validate_region(next()??, next()??)?; + let flags = MapFlags::from_bits(next()??).ok_or(Error::new(EINVAL))?; + + if fd == !0 { + if op == ADDRSPACE_OP_TRANSFER { + return Err(Error::new(EOPNOTSUPP)); + } + + return MemoryScheme::fmap_anonymous( + &addrspace, + &Map { + offset, + size: page_span.count * PAGE_SIZE, + address: page_span.base.start_address().data(), + flags, + }, + false, + token, + ); + } else { + let (scheme, number) = extract_scheme_number(fd, token)?; + + // ADDRSPACE_OP_MMAP and ADDRSPACE_OP_TRANSFER return the target address + // rather than the amount of written bytes. + // FIXME maybe make all these operations calls rather than writes? 
+ return scheme.kfmap( + number, + &addrspace, + &Map { + offset, + size: page_span.count * PAGE_SIZE, + address: page_span.base.start_address().data(), + flags, + }, + op == ADDRSPACE_OP_TRANSFER, + token, + ); + } + } + ADDRSPACE_OP_MUNMAP => { + let page_span = crate::syscall::validate_region(next()??, next()??)?; + + let unpin = false; + let res = addrspace.munmap(page_span, unpin, token)?; + for r in res { + let _ = r.unmap(token); + } + } + ADDRSPACE_OP_MPROTECT => { + let page_span = crate::syscall::validate_region(next()??, next()??)?; + let flags = MapFlags::from_bits(next()??).ok_or(Error::new(EINVAL))?; + + addrspace.mprotect(page_span, flags, token)?; + } + _ => return Err(Error::new(EINVAL)), + } + Ok(words_read * size_of::()) + } + ContextHandle::Regs(kind) => match kind { + RegsKind::Float => { + let regs = unsafe { buf.read_exact::()? }; + + try_stop_context(context, token, |context, _| { + // NOTE: The kernel will never touch floats + + // Ignore the rare case of floating point + // registers being uninitiated + context.set_fx_regs(regs); + + Ok(size_of::()) + }) + } + RegsKind::Int => { + let regs = unsafe { buf.read_exact::()? }; + + try_stop_context(context, token, |context, _| match context.regs_mut() { + None => { + println!( + "{}:{}: Couldn't read registers from stopped process", + file!(), + line!() + ); + Err(Error::new(ENOTRECOVERABLE)) + } + Some(stack) => { + stack.load(®s); + + Ok(size_of::()) + } + }) + } + RegsKind::Env => { + let regs = unsafe { buf.read_exact::()? }; + write_env_regs(context, regs, token)?; + Ok(size_of::()) + } + }, + ContextHandle::Sighandler => { + let data = unsafe { buf.read_exact::()? 
}; + + if data.user_handler >= crate::USER_END_OFFSET + || data.excp_handler >= crate::USER_END_OFFSET + { + return Err(Error::new(EPERM)); + } + if data.thread_control_addr >= crate::USER_END_OFFSET + || data.proc_control_addr >= crate::USER_END_OFFSET + { + return Err(Error::new(EFAULT)); + } + + let state = if data.thread_control_addr != 0 && data.proc_control_addr != 0 { + let validate_off = |addr, sz| { + let off: usize = addr % PAGE_SIZE; + if off.is_multiple_of(align_of::()) && off + sz <= PAGE_SIZE { + Ok(off as u16) + } else { + Err(Error::new(EINVAL)) + } + }; + + let addrsp = Arc::clone(context.read(token.token()).addr_space()?); + + Some(SignalState { + threadctl_off: validate_off( + data.thread_control_addr, + size_of::(), + )?, + procctl_off: validate_off( + data.proc_control_addr, + size_of::(), + )?, + user_handler: NonZeroUsize::new(data.user_handler) + .ok_or(Error::new(EINVAL))?, + excp_handler: NonZeroUsize::new(data.excp_handler), + thread_control: addrsp.borrow_frame_enforce_rw_allocated( + Page::containing_address(VirtualAddress::new(data.thread_control_addr)), + token, + )?, + proc_control: addrsp.borrow_frame_enforce_rw_allocated( + Page::containing_address(VirtualAddress::new(data.proc_control_addr)), + token, + )?, + }) + } else { + None + }; + + context.write(token.token()).sig = state; + + Ok(size_of::()) + } + ContextHandle::Start => match context.write(token.token()).status { + ref mut status @ Status::HardBlocked { + reason: HardBlockedReason::NotYetStarted, + } => { + *status = Status::Runnable; + Ok(buf.len()) + } + _ => Err(Error::new(EINVAL)), + }, + ContextHandle::Filetable { .. } | ContextHandle::NewFiletable { .. 
} => { + Err(Error::new(EBADF)) + } + + ContextHandle::CurrentFiletable => { + let filetable_fd = buf.read_usize()?; + let (hopefully_this_scheme, number) = extract_scheme_number(filetable_fd, token)?; + verify_scheme(hopefully_this_scheme)?; + + let mut handles = HANDLES.write(token.token()); + let Entry::Occupied(mut entry) = handles.entry(number) else { + return Err(Error::new(EBADF)); + }; + let filetable = match *entry.get_mut() { + Handle { + kind: ContextHandle::Filetable { ref filetable, .. }, + .. + } => filetable.upgrade().ok_or(Error::new(EOWNERDEAD))?, + Handle { + kind: + ContextHandle::NewFiletable { + ref filetable, + binary_format, + ref data, + }, + .. + } => { + let ft = Arc::clone(filetable); + *entry.get_mut() = Handle { + kind: ContextHandle::Filetable { + filetable: Arc::downgrade(filetable), + binary_format, + data: data.clone(), + }, + context: Arc::clone(&context), + }; + ft + } + + _ => return Err(Error::new(EBADF)), + }; + + *handles.get_mut(&id).ok_or(Error::new(EBADF))? = Handle { + kind: ContextHandle::AwaitingFiletableChange { new_ft: filetable }, + context, + }; + + Ok(size_of::()) + } + ContextHandle::CurrentAddrSpace => { + let mut iter = buf.usizes(); + let addrspace_fd = iter.next().ok_or(Error::new(EINVAL))??; + let sp = iter.next().ok_or(Error::new(EINVAL))??; + let ip = iter.next().ok_or(Error::new(EINVAL))??; + let arg1 = iter.next().transpose()?; + + let (hopefully_this_scheme, number) = extract_scheme_number(addrspace_fd, token)?; + verify_scheme(hopefully_this_scheme)?; + + let mut handles = HANDLES.write(token.token()); + let &Handle { + kind: ContextHandle::AddrSpace { ref addrspace }, + .. + } = handles.get(&number).ok_or(Error::new(EBADF))? + else { + return Err(Error::new(EBADF)); + }; + + *handles.get_mut(&id).ok_or(Error::new(EBADF))? 
= Handle { + context, + kind: Self::AwaitingAddrSpaceChange { + new: Arc::clone(addrspace), + new_sp: sp, + new_ip: ip, + arg1, + }, + }; + + let written = if arg1.is_some() { + 4 * size_of::() + } else { + 3 * size_of::() + }; + + Ok(written) + } + Self::MmapMinAddr(ref addrspace) => { + let val = buf.read_usize()?; + if val % PAGE_SIZE != 0 || val > crate::USER_END_OFFSET { + return Err(Error::new(EINVAL)); + } + let mut lock_token = token.token(); + addrspace.acquire_write(lock_token.downgrade()).mmap_min = val; + Ok(size_of::()) + } + Self::SchedAffinity => { + let mask = unsafe { buf.read_exact::()? }; + + context + .write(token.token()) + .sched_affinity + .override_from(&mask); + + Ok(size_of_val(&mask)) + } + ContextHandle::Status { privileged } => { + let mut args = buf.usizes(); + + let user_data = args.next().ok_or(Error::new(EINVAL))??; + + let context_verb = + ContextVerb::try_from_raw(user_data).ok_or(Error::new(EINVAL))?; + + match context_verb { + // TODO: lwp_park/lwp_unpark for bypassing procmgr? + ContextVerb::Unstop | ContextVerb::Stop if !privileged => { + Err(Error::new(EPERM)) + } + ContextVerb::Stop => { + let mut guard = context.write(token.token()); + + match guard.status { + Status::Dead { .. } => return Err(Error::new(EOWNERDEAD)), + Status::HardBlocked { + reason: HardBlockedReason::AwaitingMmap { .. }, + } => todo!(), + _ => (), + } + guard.status = Status::HardBlocked { + reason: HardBlockedReason::Stopped, + }; + // TODO: wait for context to be switched away from, and/or IPI? 
+ Ok(size_of::()) + } + ContextVerb::Unstop => { + let mut guard = context.write(token.token()); + + if let Status::HardBlocked { + reason: HardBlockedReason::Stopped, + } = guard.status + { + guard.status = Status::Runnable; + } + Ok(size_of::()) + } + ContextVerb::Interrupt => { + let mut guard = context.write(token.token()); + guard.unblock(); + Ok(size_of::()) + } + ContextVerb::ForceKill => { + if context::is_current(&context) { + //trace!("FORCEKILL SELF {} {}", context.read().debug_id, context.read().pid); + // The following functionality simplifies the cleanup step when detached threads + // terminate. + if let Some(post_unmap) = args.next() { + let base = post_unmap?; + let size = args.next().ok_or(Error::new(EINVAL))??; + + if size > 0 { + let addrsp = + Arc::clone(context.read(token.token()).addr_space()?); + let res = addrsp.munmap( + PageSpan::validate_nonempty( + VirtualAddress::new(base), + size, + ) + .ok_or(Error::new(EINVAL))?, + false, + token, + )?; + for r in res { + let _ = r.unmap(token); + } + } + } + crate::syscall::exit_this_context(None, token); + } else { + let mut ctxt = context.write(token.token()); + //trace!("FORCEKILL NONSELF={} {}, SELF={}", ctxt.debug_id, ctxt.pid, context::current().read().debug_id); + if let context::Status::Dead { .. } = ctxt.status { + return Ok(size_of::()); + } + ctxt.status = context::Status::Runnable; + ctxt.being_sigkilled = true; + Ok(size_of::()) + } + } + } + } + ContextHandle::Attr => { + let info = unsafe { buf.read_exact::()? 
}; + let mut guard = context.write(token.token()); + + let len = info + .debug_name + .iter() + .position(|c| *c == 0) + .unwrap_or(info.debug_name.len()) + .min(guard.name.capacity()); + let debug_name = core::str::from_utf8(&info.debug_name[..len]) + .map_err(|_| Error::new(EINVAL))?; + guard.name.clear(); + guard.name.push_str(debug_name); + + guard.pid = info.pid as usize; + guard.euid = info.euid; + guard.egid = info.egid; + guard.prio = (info.prio as usize).min(39); + Ok(size_of::()) + } + Self::Groups => { + const NGROUPS_MAX: usize = 65536; + if buf.len() % size_of::() != 0 { + return Err(Error::new(EINVAL)); + } + let count = buf.len() / size_of::(); + if count > NGROUPS_MAX { + return Err(Error::new(EINVAL)); + } + let mut groups = Vec::with_capacity(count); + for chunk in buf.in_exact_chunks(size_of::()).take(count) { + groups.push(chunk.read_u32()?); + } + let proc_id = { + let guard = context.read(token.token()); + guard.owner_proc_id + }; + { + let mut guard = context.write(token.token()); + guard.groups = groups.clone(); + } + if let Some(pid) = proc_id { + let mut contexts = context::contexts(token.downgrade()); + let (contexts, mut t) = contexts.token_split(); + for context_ref in contexts.iter() { + let mut ctx = context_ref.write(t.token()); + if ctx.owner_proc_id == Some(pid) { + ctx.groups = groups.clone(); + } + } + } + Ok(count * size_of::()) + } + ContextHandle::OpenViaDup => { + let mut args = buf.usizes(); + + let user_data = args.next().ok_or(Error::new(EINVAL))??; + + let context_verb = + ContextVerb::try_from_raw(user_data).ok_or(Error::new(EINVAL))?; + + match context_verb { + ContextVerb::ForceKill => { + if context::is_current(&context) { + //trace!("FORCEKILL SELF {} {}", context.read().debug_id, context.read().pid); + // The following functionality simplifies the cleanup step when detached threads + // terminate. 
+ if let Some(post_unmap) = args.next() { + let base = post_unmap?; + let size = args.next().ok_or(Error::new(EINVAL))??; + + if size > 0 { + let addrsp = + Arc::clone(context.read(token.token()).addr_space()?); + let res = addrsp.munmap( + PageSpan::validate_nonempty( + VirtualAddress::new(base), + size, + ) + .ok_or(Error::new(EINVAL))?, + false, + token, + )?; + for r in res { + let _ = r.unmap(token); + } + } + } + crate::syscall::exit_this_context(None, token); + } else { + Err(Error::new(EPERM)) + } + } + _ => Err(Error::new(EINVAL)), + } + } + _ => Err(Error::new(EBADF)), + } + } + fn kreadoff( + &self, + _id: usize, + context: Arc, + buf: UserSliceWo, + offset: u64, + token: &mut CleanLockToken, + ) -> Result { + match self { + ContextHandle::Regs(kind) => { + union Output { + float: FloatRegisters, + int: IntRegisters, + env: EnvRegisters, + } + + let (output, size) = match kind { + RegsKind::Float => { + let context = context.read(token.token()); + // NOTE: The kernel will never touch floats + + ( + Output { + float: context.get_fx_regs(), + }, + size_of::(), + ) + } + RegsKind::Int => { + try_stop_context(context, token, |context, _| match context.regs() { + None => { + assert!(!context.running, "try_stop_context is broken, clearly"); + println!( + "{}:{}: Couldn't read registers from stopped process", + file!(), + line!() + ); + Err(Error::new(ENOTRECOVERABLE)) + } + Some(stack) => { + let mut regs = IntRegisters::default(); + stack.save(&mut regs); + Ok((Output { int: regs }, size_of::())) + } + })? 
+ } + RegsKind::Env => ( + Output { + env: read_env_regs(context, token)?, + }, + size_of::(), + ), + }; + + let src_buf = + unsafe { slice::from_raw_parts(&output as *const _ as *const u8, size) }; + + buf.copy_common_bytes_from_slice(src_buf) + } + ContextHandle::AddrSpace { addrspace } => { + let Ok(offset) = usize::try_from(offset) else { + return Ok(0); + }; + let grants_to_skip = offset / size_of::(); + + // Output a list of grant descriptors, sufficient to allow relibc's fork() + // implementation to fmap MAP_SHARED grants. + let mut grants_read = 0; + + let mut dst = [GrantDesc::default(); 16]; + + let mut token = token.token(); + let addr_space = addrspace.acquire_read(token.downgrade()); + for (dst, (grant_base, grant_info)) in dst + .iter_mut() + .zip(addr_space.grants.iter().skip(grants_to_skip)) + { + *dst = GrantDesc { + base: grant_base.start_address().data(), + size: grant_info.page_count() * PAGE_SIZE, + flags: grant_info.grant_flags(), + // The !0 is not a sentinel value; the availability of `offset` is + // indicated by the GRANT_SCHEME flag. + offset: grant_info.file_ref().map_or(!0, |f| f.base_offset as u64), + }; + grants_read += 1; + } + for (src, chunk) in dst + .iter() + .take(grants_read) + .zip(buf.in_exact_chunks(size_of::())) + { + chunk.copy_exactly(src)?; + } + + Ok(grants_read * size_of::()) + } + + ContextHandle::Filetable { data, .. } => read_from(buf, data, offset), + ContextHandle::MmapMinAddr(addrspace) => { + let mut token = token.token(); + let addr = addrspace.acquire_read(token.downgrade()); + buf.write_usize(addr.mmap_min)?; + Ok(size_of::()) + } + ContextHandle::SchedAffinity => { + let mask = context.read(token.token()).sched_affinity.to_raw(); + + buf.copy_exactly(crate::cpu_set::mask_as_bytes(&mask))?; + Ok(size_of_val(&mask)) + } // TODO: Replace write() with SYS_SENDFD? + ContextHandle::Status { .. 
} => { + let status = { + let context = context.read(token.token()); + match context.status { + Status::Runnable | Status::Dead { excp: None } + if context.being_sigkilled => + { + ContextStatus::ForceKilled + } + Status::Dead { excp: None } => ContextStatus::Dead, + Status::Dead { excp: Some(excp) } => { + let (status, payload) = + buf.split_at(size_of::()).ok_or(Error::new(EINVAL))?; + status.copy_from_slice( + &(ContextStatus::UnhandledExcp as usize).to_ne_bytes(), + )?; + let len = payload.copy_common_bytes_from_slice(&excp)?; + return Ok(size_of::() + len); + } + Status::Runnable => ContextStatus::Runnable, + Status::Blocked => ContextStatus::Blocked, + Status::HardBlocked { + reason: HardBlockedReason::NotYetStarted, + } => ContextStatus::NotYetStarted, + Status::HardBlocked { + reason: HardBlockedReason::Stopped, + } => ContextStatus::Stopped, + _ => ContextStatus::Other, + } + }; + buf.copy_common_bytes_from_slice(&(status as usize).to_ne_bytes()) + } + ContextHandle::Attr => { + let mut debug_name = [0; 32]; + let c = &context.read(token.token()); + let (euid, egid, pid, name, prio) = + (c.euid, c.egid, c.pid as u32, c.name, c.prio as u32); + let min = name.len().min(debug_name.len()); + debug_name[..min].copy_from_slice(&name.as_bytes()[..min]); + buf.copy_common_bytes_from_slice(&ProcSchemeAttrs { + pid, + euid, + egid, + prio, + debug_name, + }) + } + Self::Groups => { + let c = &context.read(token.token()); + let max = buf.len() / size_of::(); + let count = c.groups.len().min(max); + for (chunk, gid) in buf.in_exact_chunks(size_of::()).zip(&c.groups).take(count) { + chunk.copy_from_slice(&gid.to_ne_bytes())?; + } + Ok(count * size_of::()) + } + ContextHandle::Sighandler => { + let data = match context.read(token.token()).sig { + Some(ref sig) => SetSighandlerData { + excp_handler: sig.excp_handler.map_or(0, NonZeroUsize::get), + user_handler: sig.user_handler.get(), + proc_control_addr: sig.procctl_off.into(), + thread_control_addr: 
sig.threadctl_off.into(), + }, + None => SetSighandlerData::default(), + }; + buf.copy_common_bytes_from_slice(&data) + } + + // TODO: Find a better way to switch address spaces, since they also require switching + // the instruction and stack pointer. Maybe remove `/regs` altogether and replace it + // with `/ctx` + _ => Err(Error::new(EBADF)), + } + } +} + +fn write_env_regs( + context: Arc, + regs: EnvRegisters, + token: &mut CleanLockToken, +) -> Result<()> { + if context::is_current(&context) { + context::current() + .write(token.token()) + .write_current_env_regs(regs) + } else { + try_stop_context(context, token, |context, _| context.write_env_regs(regs)) + } +} + +fn read_env_regs(context: Arc, token: &mut CleanLockToken) -> Result { + if context::is_current(&context) { + context::current() + .read(token.token()) + .read_current_env_regs() + } else { + try_stop_context(context, token, |context, _| context.read_env_regs()) + } +} diff --git a/src/scheme/serio.rs b/src/scheme/serio.rs new file mode 100644 index 0000000000..26505021ad --- /dev/null +++ b/src/scheme/serio.rs @@ -0,0 +1,157 @@ +//! PS/2 unfortunately requires a kernel driver to prevent race conditions due +//! 
to how status is utilized + +use syscall::data::GlobalSchemes; + +use crate::{ + event, + scheme::*, + sync::{CleanLockToken, RwLock, WaitQueue, L1}, + syscall::{ + flag::{EventFlags, EVENT_READ, O_NONBLOCK}, + usercopy::UserSliceWo, + }, +}; + +use super::StrOrBytes; + +/// Input queue +static INPUT: [WaitQueue; 2] = [WaitQueue::new(), WaitQueue::new()]; + +#[derive(Clone, Copy, PartialEq, Eq)] +enum HandleKind { + Device(usize), + SchemeRoot, +} + +#[derive(Clone, Copy)] +struct Handle { + kind: HandleKind, +} + +static HANDLES: RwLock> = RwLock::new(HandleMap::new()); + +/// Add to the input queue +pub fn serio_input(index: usize, data: u8, token: &mut CleanLockToken) { + crate::profiling::serio_command(index, data); + + INPUT[index].send(data, token); + + let ids: Vec = { + HANDLES + .read(token.token()) + .iter() + .map(|(id, _)| *id) + .collect() + }; + + for id in ids { + event::trigger(GlobalSchemes::Serio.scheme_id(), id, EVENT_READ, token); + } +} + +pub struct SerioScheme; + +impl KernelScheme for SerioScheme { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let id = HANDLES.write(token.token()).insert(Handle { + kind: HandleKind::SchemeRoot, + }); + Ok(id) + } + + fn kopenat( + &self, + id: usize, + user_buf: StrOrBytes, + _flags: usize, + _fcntl_flags: u32, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + { + let handles = HANDLES.read(token.token()); + let handle = handles.get(id)?; + + if !matches!(handle.kind, HandleKind::SchemeRoot) { + return Err(Error::new(EACCES)); + } + } + + let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; + if ctx.uid != 0 { + return Err(Error::new(EPERM)); + } + + let index = path.parse::().or(Err(Error::new(ENOENT)))?; + if index >= INPUT.len() { + return Err(Error::new(ENOENT)); + } + + let id = HANDLES.write(token.token()).insert(Handle { + kind: HandleKind::Device(index), + }); + + Ok(OpenResult::SchemeLocal(id, InternalFlags::empty())) + } + + fn fevent( + &self, + id: usize, 
+ _flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + let handles = HANDLES.read(token.token()); + let handle = handles.get(id)?; + + if let HandleKind::Device(_) = handle.kind { + Ok(EventFlags::empty()) + } else { + Err(Error::new(EBADF)) + } + } + + fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + HANDLES.read(token.token()).get(id)?; + Ok(()) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + HANDLES.write(token.token()).remove(id)?; + Ok(()) + } + + fn kread( + &self, + id: usize, + buf: UserSliceWo, + flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let handle = *HANDLES.read(token.token()).get(id)?; + + let index = match handle.kind { + HandleKind::Device(index) => index, + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + }; + + INPUT[index].receive_into_user( + buf, + flags & O_NONBLOCK as u32 == 0, + "SerioScheme::read", + token, + ) + } + + fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let handle = *HANDLES.read(token.token()).get(id)?; + + let path = match handle.kind { + HandleKind::Device(index) => format!("serio:{}", index).into_bytes(), + HandleKind::SchemeRoot => return Err(Error::new(EBADF)), + }; + + buf.copy_common_bytes_from_slice(&path) + } +} diff --git a/src/scheme/sys/block.rs b/src/scheme/sys/block.rs new file mode 100644 index 0000000000..8d3b8e4f4e --- /dev/null +++ b/src/scheme/sys/block.rs @@ -0,0 +1,34 @@ +use alloc::{string::String, vec::Vec}; +use core::fmt::Write; + +use crate::{context::contexts, sync::CleanLockToken, syscall::error::Result}; + +pub fn resource(token: &mut CleanLockToken) -> Result> { + let mut string = String::new(); + + { + let mut rows = Vec::new(); + { + let mut contexts = contexts(token.downgrade()); + let (contexts, mut token) = contexts.token_split(); + for context_lock in contexts.iter() { + let context = context_lock.read(token.token()); + 
rows.push((context.pid, context.name, context.status_reason)); + } + } + rows.sort_by_key(|row| row.0); + + for row in rows.iter() { + let id: usize = row.0; + let name = &row.1; + + let _ = writeln!(string, "{}: {}", id, name); + + if !row.2.is_empty() { + let _ = writeln!(string, " {}", row.2); + } + } + } + + Ok(string.into_bytes()) +} diff --git a/src/scheme/sys/context.rs b/src/scheme/sys/context.rs new file mode 100644 index 0000000000..313927a08e --- /dev/null +++ b/src/scheme/sys/context.rs @@ -0,0 +1,186 @@ +use alloc::{ + borrow::ToOwned, + string::{String, ToString}, + vec::Vec, +}; +use core::fmt::Write; + +use crate::{context, context::contexts, sync::CleanLockToken, syscall::error::Result}; + +pub fn resource(token: &mut CleanLockToken) -> Result> { + let mut string = format!( + "{:<6}{:<6}{:<6}{:<6}{:<6}{:<11}{:<12}{:<8}{:<8}{}\n", + "PID", "EUID", "EGID", "STAT", "CPU", "AFFINITY", "TIME", "PRIVATE", "SHARED", "NAME" + ); + + let mut rows = Vec::new(); + { + let mut contexts = contexts(token.downgrade()); + let (contexts, mut token) = contexts.token_split(); + for context_ref in contexts.iter() { + let context = context_ref.read(token.token()); + let addr_space = context.addr_space().map(|a| a.clone()); + + let affinity = context.sched_affinity.to_string(); + let cpu_time_s = context.cpu_time / crate::time::NANOS_PER_SEC; + let cpu_time_ns = context.cpu_time % crate::time::NANOS_PER_SEC; + let mut memory = context.kfx.len(); + if let Some(ref kstack) = context.kstack { + memory += kstack.len(); + } + let (status, cpuid) = (context.status.clone(), context.cpu_id); + let (pid, euid, egid) = (context.pid, context.euid, context.egid); + let (running, is_awake) = (context.running, context.wake.is_some()); + let name = context.name; + drop(context); + + let heap = match addr_space { + Ok(addr_space) => { + let addr_space_guard = addr_space.acquire_read(token.downgrade()); + let mut private_memory = 0; + let mut shared_memory = 0; + // TODO: All user 
programs must have some grant in order for executable memory to even + // exist, but is this a good indicator of whether it is user or kernel? + let is_kernel = addr_space_guard.grants.is_empty(); + for (_base, info) in addr_space_guard.grants.iter() { + // wrap as method? + match info.provider { + context::memory::Provider::Allocated { .. } => { + private_memory += info.page_count() * crate::memory::PAGE_SIZE + } + // Excluded because it is not allocable by user, whether + // this region is counted toward usable memory remain unknown + context::memory::Provider::PhysBorrowed { .. } => {} + _ => shared_memory += info.page_count() * crate::memory::PAGE_SIZE, + } + } + Some((private_memory, shared_memory, is_kernel)) + } + Err(_) => None, + }; + + let mut stat_string = String::new(); + stat_string.push(match heap { + Some((_, _, is_kernel)) => { + if is_kernel { + 'K' + } else { + 'U' + } + } + _ => 'R', + }); + match status { + context::Status::Runnable => { + stat_string.push('R'); + } + context::Status::Blocked | context::Status::HardBlocked { .. } => { + if is_awake { + stat_string.push('S'); + } else { + stat_string.push('B'); + } + } + context::Status::Dead { .. 
/// Format a byte count with a binary unit (`B`, `KB`, `MB`, `GB`).
///
/// Values up to and including one unit are printed in the next smaller unit,
/// e.g. exactly 1024 bytes is rendered as `1024 B`.
fn format_bytes(memory: usize) -> String {
    const GB: usize = 1024 * 1024 * 1024;
    const MB: usize = 1024 * 1024;
    const KB: usize = 1024;

    if memory > GB {
        format_bytes_inner(memory, GB, "GB")
    } else if memory > MB {
        format_bytes_inner(memory, MB, "MB")
    } else if memory > KB {
        format_bytes_inner(memory, KB, "KB")
    } else {
        format!("{memory} B")
    }
}

/// Format `memory / divisor` followed by `suffix`, keeping roughly three
/// significant digits: two decimals for a one-digit integer part, one decimal
/// for a two-digit integer part, none otherwise.
///
/// The fractional digits are computed as `remainder * 10^k / divisor`, so the
/// result is always below `10^k`. The previous `remainder / (divisor / 10^k)`
/// form truncated the step and could yield values such as `1.102 KB` or
/// `10.10 KB`.
///
/// `divisor` must be non-zero.
fn format_bytes_inner(memory: usize, divisor: usize, suffix: &'static str) -> String {
    // Widen to u64 so `remainder * 100` cannot overflow on 32-bit targets.
    let remainder = (memory % divisor) as u64;
    let unit = divisor as u64;

    let mut s = format!("{}", memory / divisor);
    if s.len() == 1 {
        let _ = write!(s, ".{:02}", remainder * 100 / unit);
    } else if s.len() == 2 {
        let _ = write!(s, ".{:01}", remainder * 10 / unit);
    }

    let _ = write!(s, " {suffix}");
    s
}
b/src/scheme/sys/cpu.rs @@ -0,0 +1,16 @@ +use alloc::vec::Vec; + +use crate::{ + arch::device::cpu::cpu_info, + sync::CleanLockToken, + syscall::error::{Error, Result, EIO}, +}; + +pub fn resource(_token: &mut CleanLockToken) -> Result> { + let mut string = format!("CPUs: {}\n", crate::cpu_count()); + + match cpu_info(&mut string) { + Ok(()) => Ok(string.into_bytes()), + Err(_) => Err(Error::new(EIO)), + } +} diff --git a/src/scheme/sys/exe.rs b/src/scheme/sys/exe.rs new file mode 100644 index 0000000000..947879dac0 --- /dev/null +++ b/src/scheme/sys/exe.rs @@ -0,0 +1,11 @@ +use alloc::vec::Vec; + +use crate::{context, sync::CleanLockToken, syscall::error::Result}; + +pub fn resource(token: &mut CleanLockToken) -> Result> { + Ok(context::current() + .read(token.token()) + .name + .as_bytes() + .to_vec()) +} diff --git a/src/scheme/sys/fdstat.rs b/src/scheme/sys/fdstat.rs new file mode 100644 index 0000000000..59965be76f --- /dev/null +++ b/src/scheme/sys/fdstat.rs @@ -0,0 +1,107 @@ +use crate::{ + alloc::string::ToString, + context::{contexts, file::LockedFileDescription, memory::AddrSpaceWrapper}, + scheme::{self, handles, KernelSchemes}, + sync::CleanLockToken, + syscall::error::Result, +}; +use alloc::{borrow::Cow, string::String, sync::Arc, vec::Vec}; +use core::{fmt::Write, hash::Hash}; +use hashbrown::HashMap; + +#[derive(Debug)] +struct Ref(Arc); +impl Hash for Ref { + fn hash(&self, state: &mut H) { + state.write_usize(Arc::as_ptr(&self.0) as usize); + } +} +impl PartialEq for Ref { + fn eq(&self, other: &Self) -> bool { + Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0) + } +} +impl Eq for Ref {} +#[derive(Default)] +struct Descr { + owners: HashMap, String>, + scheme: Cow<'static, str>, + number: usize, +} + +#[cfg_attr(not(feature = "sys_fdstat"), expect(dead_code))] +pub fn resource(token: &mut CleanLockToken) -> Result> { + let mut map = HashMap::, Descr>::new(); + let mut report = String::new(); + let mut schemes_guard = handles().read(token.token()); + 
let (schemes, mut token) = schemes_guard.token_split(); + + let mut contexts = contexts(token.token()); + let (contexts, mut token) = contexts.token_split(); + 'contexts: for context in contexts.iter() { + let mut context_guard = context.read(token.token()); + let (context, token) = context_guard.token_split(); + let mut files_guard = context.files.read(token); + let (files, mut token) = files_guard.token_split(); + writeln!(report, "'{}' {{", context.name).unwrap(); + + for file in files.iter().filter_map(|f| f.clone()) { + writeln!( + report, + "\tS{}W{}", + Arc::strong_count(&file.description), + Arc::weak_count(&file.description) + ) + .unwrap(); + let fr = Ref(file.description.clone()); + let Some(a) = context.addr_space.clone() else { + continue 'contexts; + }; + let descr = map.entry(fr).or_default(); + + let scheme_id = file.description.read(token.token()).scheme; + let scheme = schemes.get(&scheme_id); + descr + .owners + .entry(Ref(a)) + .or_insert(context.name.clone().to_string()); + descr.scheme = match scheme { + Some(scheme::Handle::SchemeCreationCapability) => "SchemeCreationCapability".into(), + Some(scheme::Handle::Scheme(KernelSchemes::Global(g))) => g.as_str().into(), + Some(scheme::Handle::Scheme(KernelSchemes::User(scheme))) => { + format!("[user {:p}]", Arc::as_ptr(&scheme.inner)).into() + } + Some(scheme::Handle::Scheme(KernelSchemes::SchemeMgr)) => "SchemeMgr".into(), + _ => format!("[unknown {}]", scheme_id.0).into(), + }; + descr.number = file.description.read(token.token()).number; + } + writeln!(report, "}}").unwrap(); + } + writeln!(report, "==========").unwrap(); + let mut singletons = 0; + for (fr, ma) in map.iter() { + if ma.owners.len() == 1 { + singletons += 1; + } + writeln!( + report, + "{:p}: {:?}; {}:{}", + fr.0, + ma.owners.values().cloned().collect::>(), + ma.scheme, + ma.number, + ) + .unwrap(); + } + writeln!(report, "==========").unwrap(); + writeln!( + report, + "{} singletons out of {} total", + singletons, + map.len() + 
) + .unwrap(); + + Ok(report.into()) +} diff --git a/src/scheme/sys/iostat.rs b/src/scheme/sys/iostat.rs new file mode 100644 index 0000000000..d3bdd81dba --- /dev/null +++ b/src/scheme/sys/iostat.rs @@ -0,0 +1,129 @@ +use crate::{ + context::{ + self, + memory::{Grant, PageSpan}, + }, + memory::PAGE_SIZE, + scheme, + sync::CleanLockToken, + syscall::{ + error::Result, + flag::MapFlags, + usercopy::{UserSlice, UserSliceRw}, + }, +}; +use alloc::{string::String, sync::Arc, vec::Vec}; +use core::{fmt::Write, num::NonZeroUsize, str}; + +fn inner(fpath_user: UserSliceRw, token: &mut CleanLockToken) -> Result> { + let mut string = String::new(); + let mut fpath_kernel = [0; PAGE_SIZE]; + + { + let mut rows = Vec::new(); + { + let mut contexts = context::contexts(token.downgrade()); + let (contexts, mut token) = contexts.token_split(); + for context_ref in contexts.iter() { + let mut current = context_ref.read(token.token()); + let (context, mut token) = current.token_split(); + rows.push(( + context.pid, + context.name, + context.files.read(token.token()).clone(), + )); + } + } + rows.sort_by_key(|row| row.0); + + for (id, name, fs) in rows.iter() { + let _ = writeln!(string, "{}: {}", id, name); + + for (fd, f) in fs.enumerate() { + let file = match *f { + None => continue, + Some(ref file) => file.clone(), + }; + + let (scheme, number, flags) = { + let desc = file.description.read(token.token()); + (desc.scheme, desc.number, desc.flags) + }; + + let _ = write!( + string, + "{} {:>4}: {:>8} {:>8} {:>08X}: ", + if fd & syscall::UPPER_FDTBL_TAG == 0 { + " " + } else { + "U" + }, + fd & !syscall::UPPER_FDTBL_TAG, + scheme.get(), + number, + flags + ); + + let scheme = { + match scheme::get_scheme(token.token(), scheme) { + Ok(scheme) => scheme.clone(), + Err(_) => { + let _ = writeln!(string, "no scheme",); + continue; + } + } + }; + + match scheme.kfpath(number, fpath_user.reinterpret_unchecked(), token) { + Ok(path_len) => { + fpath_user.copy_to_slice(&mut 
fpath_kernel)?; + let fname = str::from_utf8(&fpath_kernel[..path_len]).unwrap_or("?"); + let _ = writeln!(string, "{}", fname); + } + Err(err) => { + let _ = writeln!(string, "{}", err); + } + } + } + } + } + + Ok(string.into_bytes()) +} + +pub fn resource(token: &mut CleanLockToken) -> Result> { + let page_count = NonZeroUsize::new(1).unwrap(); + let fpath_page = { + let addr_space = Arc::clone(context::current().read(token.token()).addr_space()?); + addr_space + .acquire_write(token.token().downgrade()) + .mmap_anywhere( + &addr_space, + page_count, + MapFlags::PROT_READ | MapFlags::PROT_WRITE, + |page, flags, mapper, flusher| { + let shared = false; + Ok(Grant::zeroed( + PageSpan::new(page, page_count.get()), + flags, + mapper, + flusher, + shared, + )?) + }, + )? + }; + + let res = UserSlice::rw(fpath_page.start_address().data(), PAGE_SIZE) + .and_then(|fpath_user| inner(fpath_user, token)); + + { + let addr_space = Arc::clone(context::current().read(token.token()).addr_space()?); + let res = addr_space.munmap(PageSpan::new(fpath_page, page_count.get()), false, token)?; + for r in res { + let _ = r.unmap(token); + } + } + + res +} diff --git a/src/scheme/sys/irq.rs b/src/scheme/sys/irq.rs new file mode 100644 index 0000000000..f76de90573 --- /dev/null +++ b/src/scheme/sys/irq.rs @@ -0,0 +1,17 @@ +use alloc::{string::String, vec::Vec}; +use core::fmt::Write; + +use crate::{sync::CleanLockToken, syscall::error::Result}; + +pub fn resource(_token: &mut CleanLockToken) -> Result> { + let mut string = String::new(); + + { + let counts = crate::scheme::irq::COUNTS.lock(); + for (i, count) in counts.iter().enumerate() { + let _ = writeln!(string, "{}: {}", i, count); + } + } + + Ok(string.into_bytes()) +} diff --git a/src/scheme/sys/log.rs b/src/scheme/sys/log.rs new file mode 100644 index 0000000000..8cdd43dfb0 --- /dev/null +++ b/src/scheme/sys/log.rs @@ -0,0 +1,16 @@ +use alloc::vec::Vec; + +use crate::{log::LOG, sync::CleanLockToken, syscall::error::Result}; + 
+pub fn resource(_token: &mut CleanLockToken) -> Result> { + let mut vec = Vec::new(); + + if let Some(ref log) = *LOG.lock() { + let slices = log.read(); + vec.reserve_exact(slices.0.len() + slices.1.len()); + vec.extend_from_slice(slices.0); + vec.extend_from_slice(slices.1); + } + + Ok(vec) +} diff --git a/src/scheme/sys/mod.rs b/src/scheme/sys/mod.rs new file mode 100644 index 0000000000..8f26187a79 --- /dev/null +++ b/src/scheme/sys/mod.rs @@ -0,0 +1,333 @@ +// TODO: This scheme can be simplified significantly, and through it, several other APIs where it's +// dubious whether they require dedicated schemes (like irq, dtb, acpi). In particular, the kernel +// could abandon the filesystem-like APIs here in favor of SYS_CALL, and instead let userspace wrap +// those to say shell-accessible fs-like APIs. + +use ::syscall::{ + dirent::{DirEntry, DirentBuf, DirentKind}, + EACCES, EINVAL, EIO, EISDIR, ENOTDIR, EPERM, +}; +use alloc::{sync::Arc, vec::Vec}; +use core::str; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use crate::arch::interrupt; +use crate::{ + context::file::InternalFlags, + sync::{CleanLockToken, RwLock, L1}, + syscall::{ + data::Stat, + error::{Error, Result, EBADF, ENOENT}, + flag::{MODE_DIR, MODE_FILE}, + usercopy::{UserSliceRo, UserSliceWo}, + }, +}; + +use super::{CallerCtx, HandleMap, KernelScheme, OpenResult, StrOrBytes}; + +mod block; +mod context; +mod cpu; +mod exe; +mod fdstat; +mod iostat; +mod irq; +mod log; +mod stat; +mod syscall; +mod uname; + +enum Handle { + TopLevel, + Resource { + path: &'static str, + kind: Kind, + data: Arc>>>, + }, + SchemeRoot, +} + +#[derive(Clone, Copy)] +enum Kind { + Rd(fn(&mut CleanLockToken) -> Result>), + Wr(fn(&[u8], &mut CleanLockToken) -> Result), +} +use Kind::{Rd, Wr}; +impl Kind { + fn generate_data(&self, token: &mut CleanLockToken) -> Result> { + match self { + Rd(handler) => handler(token), + Wr(_) => Err(Error::new(EISDIR)), + } + } +} + +/// System information scheme +pub 
struct SysScheme; +static HANDLES: RwLock> = RwLock::new(HandleMap::new()); + +const FILES: &[(&str, Kind)] = &[ + ("block", Rd(block::resource)), + ("context", Rd(context::resource)), + ("cpu", Rd(cpu::resource)), + #[cfg(feature = "sys_fdstat")] + ("fdstat", Rd(fdstat::resource)), + ("exe", Rd(exe::resource)), + ("iostat", Rd(iostat::resource)), + ("irq", Rd(irq::resource)), + ("log", Rd(log::resource)), + ("syscall", Rd(syscall::resource)), + ("uname", Rd(uname::resource)), + ("env", Rd(|_| Ok(Vec::from(crate::startup::init_env())))), + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + ("spurious_irq", Rd(interrupt::irq::spurious_irq_resource)), + ("stat", Rd(stat::resource)), + // Disabled because the debugger is inherently unsafe and probably will break the system. + /* + ("trigger_debugger", Rd(|token| unsafe { + crate::debugger::debugger(None, token); + Ok(Vec::new()) + })), + */ + ( + "update_time_offset", + Wr(crate::time::sys_update_time_offset), + ), + ( + "kstop", + Wr(|arg, token| unsafe { + match arg.trim_ascii() { + b"shutdown" => crate::stop::kstop(token), + b"reset" => crate::stop::kreset(), + b"emergency_reset" => crate::stop::emergency_reset(), + _ => Err(Error::new(EINVAL)), + } + }), + ), +]; + +impl KernelScheme for SysScheme { + fn scheme_root(&self, token: &mut CleanLockToken) -> Result { + let id = HANDLES.write(token.token()).insert(Handle::SchemeRoot); + Ok(id) + } + fn kopenat( + &self, + id: usize, + user_buf: StrOrBytes, + _flags: usize, + _fcntl_flags: u32, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + if !matches!(HANDLES.read(token.token()).get(id)?, Handle::SchemeRoot) { + return Err(Error::new(EACCES)); + } + + let path = user_buf + .as_str() + .or(Err(Error::new(EINVAL)))? 
+ .trim_matches('/'); + + if path.is_empty() { + let id = HANDLES.write(token.token()).insert(Handle::TopLevel); + + Ok(OpenResult::SchemeLocal(id, InternalFlags::POSITIONED)) + } else { + //Have to iterate to get the path without allocation + let entry = FILES + .iter() + .find(|(entry_path, _)| *entry_path == path) + .ok_or(Error::new(ENOENT))?; + + if matches!(entry.1, Wr(_)) && ctx.uid != 0 { + return Err(Error::new(EPERM)); + } + + // TODO: Initialize resources during openat to use them as a snapshot. + let id = HANDLES.write(token.token()).insert(Handle::Resource { + path: entry.0, + kind: entry.1, + data: Arc::new(RwLock::new(None)), + }); + Ok(OpenResult::SchemeLocal(id, InternalFlags::POSITIONED)) + } + } + + fn fsize(&self, id: usize, token: &mut CleanLockToken) -> Result { + let (kind, data_lock) = { + match HANDLES.read(token.token()).get(id)? { + Handle::TopLevel => return Ok(0), + Handle::Resource { kind, data, .. } => (*kind, data.clone()), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + } + }; + if matches!(kind, Kind::Wr(_)) { + return Ok(0); + } + let is_data_none = data_lock.write(token.token()).is_none(); + if is_data_none { + let new_data = kind.generate_data(token)?; + let mut data_guard = data_lock.write(token.token()); + if data_guard.is_none() { + *data_guard = Some(new_data); + } + } + let data_guard = data_lock.read(token.token()); + let data = data_guard.as_ref().ok_or(Error::new(EIO))?; + + Ok(data.len() as u64) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + HANDLES.write(token.token()).remove(id)?; + Ok(()) + } + fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let path = match HANDLES.read(token.token()).get(id)? { + Handle::TopLevel => "", + Handle::Resource { path, .. 
} => path, + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + + const FIRST: &[u8] = b"sys:"; + let mut bytes_read = buf.copy_common_bytes_from_slice(FIRST)?; + + if let Some(remaining) = buf.advance(FIRST.len()) { + bytes_read += remaining.copy_common_bytes_from_slice(path.as_bytes())?; + } + + Ok(bytes_read) + } + fn kreadoff( + &self, + id: usize, + buffer: UserSliceWo, + pos: u64, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let Ok(pos) = usize::try_from(pos) else { + return Ok(0); + }; + + let (kind, data_lock) = { + match HANDLES.read(token.token()).get(id)? { + Handle::Resource { kind, data, .. } => (*kind, data.clone()), + _ => return Err(Error::new(EBADF)), + } + }; + let is_data_none = data_lock.write(token.token()).is_none(); + if is_data_none { + let new_data = kind.generate_data(token)?; + let mut data_guard = data_lock.write(token.token()); + if data_guard.is_none() { + *data_guard = Some(new_data); + } + } + let data_guard = data_lock.read(token.token()); + let data = data_guard.as_ref().ok_or(Error::new(EIO))?; + let avail_buf = data.get(pos..).unwrap_or(&[]); + buffer.copy_common_bytes_from_slice(avail_buf) + } + fn kwriteoff( + &self, + id: usize, + buffer: UserSliceRo, + _pos: u64, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let (handler, intermediate, len) = match HANDLES.read(token.token()).get(id)? { + Handle::TopLevel + | Handle::Resource { + kind: Kind::Rd(_), .. + } => return Err(Error::new(EISDIR)), + Handle::Resource { + kind: Kind::Wr(handler), + .. 
+ } => { + let mut intermediate = [0_u8; 256]; + let len = buffer.copy_common_bytes_to_slice(&mut intermediate)?; + (*handler, intermediate, len) + } + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + handler(&intermediate[..len], token) + } + fn getdents( + &self, + id: usize, + buf: UserSliceWo, + header_size: u16, + first_index: u64, + token: &mut CleanLockToken, + ) -> Result { + let Ok(first_index) = usize::try_from(first_index) else { + return Ok(0); + }; + match HANDLES.read(token.token()).get(id)? { + Handle::Resource { .. } => Err(Error::new(ENOTDIR)), + Handle::TopLevel => { + let mut buf = DirentBuf::new(buf, header_size).ok_or(Error::new(EIO))?; + for (this_idx, (name, _)) in FILES.iter().enumerate().skip(first_index) { + buf.entry(DirEntry { + inode: this_idx as u64, + next_opaque_id: this_idx as u64 + 1, + kind: DirentKind::Regular, + name, + })?; + } + Ok(buf.finalize()) + } + Handle::SchemeRoot => Err(Error::new(EBADF)), + } + } + + fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let stat_base = { + let handles = HANDLES.read(token.token()); + match handles.get(id)? { + Handle::Resource { kind, data, .. 
} => Some((*kind, data.clone())), + Handle::TopLevel => None, + Handle::SchemeRoot => return Err(Error::new(EBADF)), + } + }; + let stat = if let Some((kind, data_lock)) = stat_base { + let is_data_none = data_lock.write(token.token()).is_none(); + if is_data_none { + let new_data = kind.generate_data(token)?; + let mut data_guard = data_lock.write(token.token()); + if data_guard.is_none() { + *data_guard = Some(new_data); + } + } + let data_guard = data_lock.read(token.token()); + let data = data_guard.as_ref().ok_or(Error::new(EIO))?; + let size = match kind { + Kind::Rd(_) => data.len() as u64, + Kind::Wr(_) => 0, + }; + Stat { + st_mode: 0o666 | MODE_FILE, + st_uid: 0, + st_gid: 0, + st_size: size, + ..Default::default() + } + } else { + Stat { + st_mode: 0o444 | MODE_DIR, + st_uid: 0, + st_gid: 0, + st_size: 0, + ..Default::default() + } + }; + buf.copy_exactly(&stat)?; + + Ok(()) + } +} diff --git a/src/scheme/sys/stat.rs b/src/scheme/sys/stat.rs new file mode 100644 index 0000000000..877d252f4f --- /dev/null +++ b/src/scheme/sys/stat.rs @@ -0,0 +1,96 @@ +use core::fmt::Write as _; + +use crate::{ + context::{contexts, Status}, + cpu_stats::{get_context_switch_count, get_contexts_count, irq_counts}, + percpu::get_all_stats, + sync::CleanLockToken, + syscall::error::Result, + time::START, +}; +use alloc::{string::String, vec::Vec}; + +/// Get the sys:stat data as displayed to the user. +pub fn resource(token: &mut CleanLockToken) -> Result> { + let start_time_sec = *START.lock(token.token()) / 1_000_000_000; + + let (contexts_running, contexts_blocked) = get_contexts_stats(token); + let res = format!( + "{}{}\n\ + boot_time: {start_time_sec}\n\ + context_switches: {}\n\ + contexts_created: {}\n\ + contexts_running: {contexts_running}\n\ + contexts_blocked: {contexts_blocked}", + get_cpu_stats(), + get_irq_stats(), + get_context_switch_count(), + get_contexts_count(), + ); + + Ok(res.into_bytes()) +} + +/// Formats CPU stats. 
+fn get_cpu_stats() -> String { + let mut cpu_data = String::new(); + let stats = get_all_stats(); + + let mut total_user = 0; + let mut total_nice = 0; + let mut total_kernel = 0; + let mut total_idle = 0; + let mut total_irq = 0; + for (id, stat) in stats { + total_user += stat.user; + total_nice += stat.nice; + total_kernel += stat.kernel; + total_idle += stat.idle; + total_irq += stat.irq; + let _ = writeln!(&mut cpu_data, "cpu{} {}", id.get(), stat); + } + format!( + "cpu {total_user} {total_nice} {total_kernel} {total_idle} {total_irq}\n\ + {cpu_data}" + ) +} + +/// Formats IRQ stats. +fn get_irq_stats() -> String { + let irq = irq_counts(); + let mut irq_total = 0; + let mut output = String::with_capacity(64); + for &c in irq.iter() { + irq_total += c; + } + let _ = write!(output, "IRQs {}", irq_total); + for &c in irq.iter() { + let _ = write!(output, " {}", c); + } + + output +} + +/// Format contexts stats. +fn get_contexts_stats(token: &mut CleanLockToken) -> (u64, u64) { + let mut running = 0; + let mut blocked = 0; + + let statuses = { + let mut contexts = contexts(token.downgrade()); + let (contexts, mut token) = contexts.token_split(); + contexts + .iter() + .map(|context| context.read(token.token()).status.clone()) + .collect::>() + }; + + for status in statuses { + if matches!(status, Status::Runnable) { + running += 1; + } else if !matches!(status, Status::Dead { .. 
}) { + blocked += 1; + } + } + (running, blocked) +} diff --git a/src/scheme/sys/syscall.rs b/src/scheme/sys/syscall.rs new file mode 100644 index 0000000000..8bce7ab90c --- /dev/null +++ b/src/scheme/sys/syscall.rs @@ -0,0 +1,39 @@ +use alloc::{string::String, vec::Vec}; +use core::fmt::Write; + +use crate::{ + context::contexts, + sync::CleanLockToken, + syscall::{self, error::Result}, +}; + +pub fn resource(token: &mut CleanLockToken) -> Result> { + let mut string = String::new(); + + { + let mut rows = Vec::new(); + { + let mut contexts = contexts(token.downgrade()); + let (contexts, mut token) = contexts.token_split(); + for context_ref in contexts.iter() { + let context = context_ref.read(token.token()); + rows.push((context.pid, context.name, context.current_syscall())); + } + } + rows.sort_by_key(|row| row.0); + + for &(id, ref name, sc) in rows.iter() { + let _ = writeln!(string, "{}: {}", id, name); + + if let Some([a, b, c, d, e, f, g]) = sc { + let _ = writeln!( + string, + " {}", + syscall::debug::format_call(a, b, c, d, e, f, g) + ); + } + } + } + + Ok(string.into_bytes()) +} diff --git a/src/scheme/sys/uname.rs b/src/scheme/sys/uname.rs new file mode 100644 index 0000000000..5b7ef97003 --- /dev/null +++ b/src/scheme/sys/uname.rs @@ -0,0 +1,12 @@ +use crate::{sync::CleanLockToken, syscall::error::Result}; +use alloc::vec::Vec; + +pub fn resource(_token: &mut CleanLockToken) -> Result> { + Ok(format!( + "Redox\n{}\n{}\n{}\n", + env!("CARGO_PKG_VERSION"), + env!("TARGET").split('-').next().unwrap(), + option_env!("COOKBOOK_SOURCE_IDENT").unwrap_or("") + ) + .into_bytes()) +} diff --git a/src/scheme/time.rs b/src/scheme/time.rs new file mode 100644 index 0000000000..4ec3aed320 --- /dev/null +++ b/src/scheme/time.rs @@ -0,0 +1,217 @@ +use alloc::vec::Vec; +use core::{fmt, str}; +use syscall::data::GlobalSchemes; + +use crate::{ + context::{file::InternalFlags, timeout}, + sync::{CleanLockToken, RwLock, L1}, + syscall::{ + data::TimeSpec, + error::*, + 
/// Which view of a clock a time-scheme handle exposes.
#[derive(Clone)]
pub enum TimeSchemeKind {
    Default,
    ClockGettime,
    ClockGetres,
    Timer,
}

/// Renders the path component for each kind; `Default` renders as the empty
/// string.
impl fmt::Display for TimeSchemeKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let component = match self {
            TimeSchemeKind::Default => "",
            TimeSchemeKind::ClockGettime => "gettime",
            TimeSchemeKind::ClockGetres => "getres",
            TimeSchemeKind::Timer => "timer",
        };
        f.write_str(component)
    }
}
Err(Error::new(ENOENT)), + } + + let id = HANDLES + .write(token.token()) + .insert(Handle::Clock(TimeSchemeHandle { clock, kind })); + + Ok(OpenResult::SchemeLocal(id, InternalFlags::empty())) + } + + fn fcntl( + &self, + _id: usize, + _cmd: usize, + _arg: usize, + _token: &mut CleanLockToken, + ) -> Result { + Ok(0) + } + + fn fevent( + &self, + id: usize, + _flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + HANDLES + .read(token.token()) + .get(id) + .and(Ok(EventFlags::empty())) + } + + fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + HANDLES.read(token.token()).get(id)?; + Ok(()) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + HANDLES.write(token.token()).remove(id).and(Ok(())) + } + fn kread( + &self, + id: usize, + buf: UserSliceWo, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let handle = match HANDLES.read(token.token()).get(id)? { + Handle::Clock(handle) => handle.clone(), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + + let mut bytes_read = 0; + + for current_chunk in buf.in_exact_chunks(size_of::()) { + let arch_time = match (handle.clock, handle.kind.clone()) { + (CLOCK_REALTIME, TimeSchemeKind::Default | TimeSchemeKind::ClockGettime) => { + time::realtime(token) + } + (CLOCK_MONOTONIC, TimeSchemeKind::Default | TimeSchemeKind::ClockGettime) => { + time::monotonic(token) + } + (CLOCK_REALTIME, TimeSchemeKind::ClockGetres) => time::realtime_resolution(), + (CLOCK_MONOTONIC, TimeSchemeKind::ClockGetres) => time::monotonic_resolution(), + _ => return Err(Error::new(EINVAL)), + }; + let time = TimeSpec { + tv_sec: (arch_time / time::NANOS_PER_SEC) as i64, + tv_nsec: (arch_time % time::NANOS_PER_SEC) as i32, + }; + current_chunk.copy_exactly(&time)?; + + bytes_read += size_of::(); + } + + Ok(bytes_read) + } + + fn kwrite( + &self, + id: usize, + buf: UserSliceRo, + _flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + 
) -> Result { + let handle = match HANDLES.read(token.token()).get(id)? { + Handle::Clock(handle) => handle.clone(), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + + let mut bytes_written = 0; + + for current_chunk in buf.in_exact_chunks(size_of::()) { + let time = unsafe { current_chunk.read_exact::()? }; + + match (handle.clock, handle.kind.clone()) { + (_, TimeSchemeKind::Default | TimeSchemeKind::Timer) => { + timeout::register( + GlobalSchemes::Time.scheme_id(), + id, + handle.clock, + time, + token, + ); + } + _ => return Err(Error::new(EINVAL)), + }; + + bytes_written += size_of::(); + } + + Ok(bytes_written) + } + fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let handle = match HANDLES.read(token.token()).get(id)? { + Handle::Clock(handle) => handle.clone(), + Handle::SchemeRoot => return Err(Error::new(EBADF)), + }; + + let scheme_path = format!("/scheme/time/{}/{}", handle.clock, handle.kind).into_bytes(); + buf.copy_common_bytes_from_slice(&scheme_path) + } +} diff --git a/src/scheme/user.rs b/src/scheme/user.rs new file mode 100644 index 0000000000..b9013021e6 --- /dev/null +++ b/src/scheme/user.rs @@ -0,0 +1,2164 @@ +use alloc::{ + sync::{Arc, Weak}, + vec::Vec, +}; +use core::{ + mem::{self, size_of, ManuallyDrop}, + num::NonZeroUsize, +}; +use slab::Slab; +use syscall::{ + schemev2::{Cqe, CqeOpcode, Opcode, Sqe, SqeFlags}, + CallFlags, FmoveFdFlags, FobtainFdFlags, MunmapFlags, RecvFdFlags, SchemeSocketCall, + SendFdFlags, StdFsCallKind, MAP_FIXED_NOREPLACE, +}; + +use crate::{ + context::{ + self, + context::{bulk_add_fds, bulk_insert_fds, HardBlockedReason}, + file::{FileDescription, FileDescriptor, InternalFlags, LockedFileDescription}, + memory::{ + AddrSpace, AddrSpaceWrapper, BorrowedFmapSource, Grant, GrantFileRef, MmapMode, + PageSpan, DANGLING, + }, + BorrowedHtBuf, ContextLock, PreemptGuard, PreemptGuardL1, Status, + }, + event, + memory::{Frame, Page, VirtualAddress, PAGE_SIZE}, + 
scheme::SchemeId, + sync::{CleanLockToken, LockToken, Mutex, RwLock, WaitQueue, L1}, + syscall::{ + data::{Map, StdFsCallMeta}, + error::*, + flag::{EventFlags, MapFlags, EVENT_READ, O_NONBLOCK, PROT_READ}, + usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo}, + }, +}; + +use super::{CallerCtx, FileHandle, KernelScheme, OpenResult}; + +/// Kernel-side half of a userspace scheme: submissions are queued in `todo` for the scheme +/// handler to read, and every outstanding tag has a slot in `states`. +pub struct UserInner { + root_id: SchemeId, + pub scheme_id: SchemeId, + context: Weak, + todo: WaitQueue, + + // TODO: custom packed radix tree data structure + states: Mutex>, +} + +/// Contents of one `UserInner::states` slot; the slot index is the request tag. +enum State { + /// A request was queued and its caller blocked. `canceling` is set once a cancellation + /// has been requested; `callee_responsible` is the span unmapped when the response arrives. + Waiting { + context: Weak, + fds: Vec>, + callee_responsible: PageSpan, + canceling: bool, + }, + /// The scheme has responded; the caller has not collected the response yet. + Responded(Response), + /// Slot reserved by `request_fmap` for the context that requested the mapping. + Fmap(Weak), + /// Fresh slot from `next_id`, or the temporary left by `mem::replace` while a slot is examined. + Placeholder, +} + +/// Payload of a scheme's answer to a request. +#[derive(Debug)] +enum Response { + /// Result, the CQE's `extra_raw[0]` byte, and `true` when sent as `RespondAndNotifyOnDetach`. + Regular(Result, u8, bool), + Fd(Arc), + MultipleFds(Option>>), +} + +impl Response { + /// Return the result of a `Regular` response; the fd-carrying variants map to `EIO`. + fn into_regular(self) -> Result { + match self { + Response::Regular(res, _, _) => res, + Response::Fd(_) | Response::MultipleFds(_) => Err(Error::new(EIO)), + } + } +} + +/// The value 1 as a `NonZeroUsize`, evaluated in a const context. +const ONE: NonZeroUsize = match NonZeroUsize::new(1) { + Some(one) => one, + None => unreachable!(), +}; + +/// A completion entry written by the scheme, decoded into the action the kernel has to take. +enum ParsedCqe { + TriggerFevent { + number: usize, + flags: EventFlags, + }, + RegularResponse { + tag: u32, + res: Result, + extra0: u8, + }, + ResponseWithFd { + tag: u32, + fd: usize, + }, + ResponseWithMultipleFds { + tag: u32, + num_fds: usize, + }, + ObtainFd { + tag: u32, + flags: FobtainFdFlags, + dst_fd_or_ptr: usize, + }, + ProvideMmap { + tag: u32, + offset: u64, + base_addr: VirtualAddress, + page_count: usize, + }, + RespondAndNotifyOnDetach { + tag: u32, + res: Result, + extra0: u8, + }, +} +impl ParsedCqe { + /// Decode `cqe` according to the opcode in the low three bits of `cqe.flags`. Returns + /// `EINVAL` for an unknown opcode or for flag bits that `from_bits` rejects. + fn parse_cqe(cqe: &Cqe) -> Result { + Ok( + match CqeOpcode::try_from_raw(cqe.flags & 0b111).ok_or(Error::new(EINVAL))? 
{ + CqeOpcode::RespondRegular => Self::RegularResponse { + tag: cqe.tag, + res: Error::demux(cqe.result as usize), + extra0: cqe.extra_raw[0], + }, + CqeOpcode::RespondAndNotifyOnDetach => Self::RespondAndNotifyOnDetach { + tag: cqe.tag, + res: Error::demux(cqe.result as usize), + extra0: cqe.extra_raw[0], + }, + CqeOpcode::RespondWithFd => Self::ResponseWithFd { + tag: cqe.tag, + fd: cqe.result as usize, + }, + CqeOpcode::RespondWithMultipleFds => Self::ResponseWithMultipleFds { + tag: cqe.tag, + num_fds: cqe.result as usize, + }, + CqeOpcode::SendFevent => Self::TriggerFevent { + number: cqe.result as usize, + flags: EventFlags::from_bits(cqe.tag as usize).ok_or(Error::new(EINVAL))?, + }, + CqeOpcode::ObtainFd => Self::ObtainFd { + tag: cqe.tag, + flags: FobtainFdFlags::from_bits(cqe.extra() as usize) + .ok_or(Error::new(EINVAL))?, + dst_fd_or_ptr: cqe.result as usize, + }, + }, + ) + } +} + +impl UserInner { + /// Create an inner with an empty submission queue and slab capacity for 32 states. + pub fn new(root_id: SchemeId, scheme_id: SchemeId, context: Weak) -> UserInner { + UserInner { + root_id, + scheme_id, + context, + todo: WaitQueue::new(), + states: Mutex::new(Slab::with_capacity(32)), + } + } + + /// Reserve a `State::Placeholder` slot and use its index as the request tag. Returns + /// `EAGAIN` when the index does not fit in `u32` (the slot is not released on that path). + fn next_id(&self, token: &mut CleanLockToken) -> Result { + let idx = { + let mut states = self.states.lock(token.token()); + states.insert(State::Placeholder) + }; + + // TODO: implement blocking? 
+ u32::try_from(idx).map_err(|_| Error::new(EAGAIN)) + } + + fn call( + &self, + ctx: CallerCtx, + fds: Vec>, + opcode: Opcode, + args: impl Args, + caller_responsible: &mut PageSpan, + token: &mut CleanLockToken, + ) -> Result { + self.call_inner( + fds, + Sqe { + opcode: opcode as u8, + sqe_flags: SqeFlags::empty(), + _rsvd: 0, + tag: self.next_id(token)?, + caller: ctx.pid as u64, + args: { + let mut a = args.args(); + a[5] = uid_gid_hack_merge([ctx.uid, ctx.gid]); + a + }, + }, + caller_responsible, + token, + ) + } + + fn call_inner( + &self, + fds: Vec>, + sqe: Sqe, + caller_responsible: &mut PageSpan, + token: &mut CleanLockToken, + ) -> Result { + { + // Disable preemption to avoid context switches between setting the + // process state and sending the scheme request. The process is made + // runnable again when the scheme response is received. Hence, we + // need to ensure that the following operations are atomic as + // otherwise the process will be blocked forever. + let current_context = context::current(); + let mut preempt = PreemptGuard::new(¤t_context, token); + let token = preempt.token(); + current_context + .write(token.token()) + .block("UserInner::call"); + { + let mut states = self.states.lock(token.token()); + states[sqe.tag as usize] = State::Waiting { + context: Arc::downgrade(¤t_context), + fds, + canceling: false, + + // This is the part that the scheme handler will deallocate when responding. It + // starts as empty, so the caller can unmap it (optimal for TLB), but is populated + // the caller is interrupted by SIGKILL. + callee_responsible: PageSpan::empty(), + }; + } + self.todo.send(sqe, token); + + event::trigger(self.root_id, self.scheme_id.get(), EVENT_READ, token); + } + + loop { + context::switch(token); + + { + let mut eintr_if_sigkill = + |callee_responsible: &mut PageSpan, token: &mut LockToken| { + // If SIGKILL was found without waiting for scheme, EINTR directly. In that + // case, data loss doesn't matter. 
+ if context::current().read(token.token()).being_sigkilled { + // Callee must deallocate memory, rather than the caller. This is less optimal + // for TLB, but we don't really have any other choice. The scheme must be able + // to access the borrowed memory until it has responded to the request. + *callee_responsible = + mem::replace(caller_responsible, PageSpan::empty()); + + Err(Error::new(EINTR)) + } else { + Ok(()) + } + }; + + let states = self.states.lock(token.token()); + let (mut states, mut token) = states.into_split(); + match states.get_mut(sqe.tag as usize) { + // invalid state + None => return Err(Error::new(EBADFD)), + Some(o) => match mem::replace(o, State::Placeholder) { + // signal wakeup while awaiting cancelation + State::Waiting { + canceling: true, + mut callee_responsible, + context, + fds, + } => { + let maybe_eintr = + eintr_if_sigkill(&mut callee_responsible, &mut token.token()); + *o = State::Waiting { + canceling: true, + callee_responsible, + context, + fds, + }; + + maybe_eintr?; + + context::current() + .write(token.token()) + .block("UserInner::call (woken up after cancelation request)"); + + // We do not want to drop the lock before blocking + // as if we get preempted in between we might miss a + // wakeup. + drop(states); + } + // spurious wakeup + State::Waiting { + canceling: false, + fds, + context, + mut callee_responsible, + } => { + let maybe_eintr = eintr_if_sigkill(&mut callee_responsible, &mut token); + let current_context = context::current(); + + *o = State::Waiting { + // Currently we treat all spurious wakeups to have the same behavior + // as signals (i.e., we send a cancellation request). It is not something + // that should happen, but it certainly can happen, for example if a context + // is awoken through its thread handle without setting any sig bits, or if the + // caller clears its own sig bits. If it actually is a signal, then it is the + // intended behavior. 
+ canceling: true, + fds, + context, + callee_responsible, + }; + + maybe_eintr?; + + // We do not want to preempt between sending the + // cancellation and blocking again where we might + // miss a wakeup. + let mut preempt = PreemptGuardL1::new(¤t_context, &mut token); + let token = preempt.token(); + + self.todo.send_locked( + Sqe { + opcode: Opcode::Cancel as u8, + sqe_flags: SqeFlags::ONEWAY, + tag: sqe.tag, + ..Default::default() + }, + token.token(), + ); + event::trigger_locked( + self.root_id, + self.scheme_id.get(), + EVENT_READ, + token.token(), + ); + + // 1. If cancellation was requested and arrived + // before the scheme processed the request, an + // acknowledgement will be sent back after the + // cancellation is processed and we will be woken up + // again. State will be State::Responded then. + // + // 2. If cancellation was requested but the scheme + // already processed the request, we will receive + // the actual response next and woken up again. + // State will be State::Responded then. + context::current() + .write(token.token()) + .block("UserInner::call (spurious wakeup)"); + drop(states); + } + + // invalid state + old_state @ (State::Placeholder | State::Fmap(_)) => { + *o = old_state; + return Err(Error::new(EBADFD)); + } + + State::Responded(response) => { + states.remove(sqe.tag as usize); + return Ok(response); + } + }, + } + } + } + } + + /// Map a readable structure to the scheme's userspace and return the + /// pointer + #[must_use = "copying back to head/tail buffers can fail"] + fn capture_user( + &self, + buf: UserSlice, + token: &mut CleanLockToken, + ) -> Result> { + UserInner::capture_inner(&self.context, buf, token) + } + fn copy_and_capture_tail( + &self, + buf: &[u8], + token: &mut CleanLockToken, + ) -> Result> { + let dst_addr_space = { + Arc::clone( + self.context + .upgrade() + .ok_or(Error::new(ENODEV))? 
+ .read(token.token()) + .addr_space()?, + ) + }; + + let mut tail = BorrowedHtBuf::tail_locked(token.downgrade())?; + let tail_frame = tail.frame(); + if buf.len() > tail.buf().len() { + return Err(Error::new(EINVAL)); + } + tail.buf_mut()[..buf.len()].copy_from_slice(buf); + + let is_pinned = true; + let dst_page = { + let mut lock_token = token.token(); + + dst_addr_space + .acquire_write(lock_token.downgrade()) + .mmap_anywhere( + &dst_addr_space, + ONE, + PROT_READ, + |dst_page, flags, mapper, flusher| { + Grant::allocated_shared_one_page( + tail_frame, dst_page, flags, mapper, flusher, is_pinned, + ) + }, + )? + }; + + let base = dst_page.start_address().data(); + let len = buf.len(); + + Ok(CaptureGuard { + base, + len, + destroyed: false, + head: CopyInfo { + src: Some(tail), + dst: None, + }, + tail: CopyInfo { + src: None, + dst: None, + }, + span: { + let (first_page, page_count, _offset) = page_range_containing(base, len); + PageSpan::new(first_page, page_count) + }, + addrsp: Some(dst_addr_space), + }) + } + + // TODO: Use an address space Arc over a context Arc. While contexts which share address spaces + // still can access borrowed scheme pages, it would both be cleaner and would handle the case + // where the initial context is closed. + /// Capture a buffer owned by userspace, mapping it contiguously onto scheme memory. + // TODO: Hypothetical accept_head_leak, accept_tail_leak options might be useful for + // libc-controlled buffer pools. + fn capture_inner( + context_weak: &Weak, + user_buf: UserSlice, + token: &mut CleanLockToken, + ) -> Result> { + let mut map_flags = MapFlags::empty(); + map_flags.set(MapFlags::PROT_READ, READ); + map_flags.set(MapFlags::PROT_WRITE, WRITE); + + if user_buf.is_empty() { + // NOTE: Rather than returning NULL, we return a dummy dangling address, which + // happens to be non-canonical on x86. This relieves scheme handlers from having to + // check the length before e.g. 
creating nonnull Rust references (when an empty length + // still requires a nonnull but possibly dangling pointer, and this has in practice + // made nulld erroneously confuse an empty Some("") with None (invalid UTF-8), due to + // enum layout optimization, as the pointer was null and not dangling). A good choice + // is thus to simply set the most-significant bit to be compatible with all alignments. + return Ok(CaptureGuard { + destroyed: false, + base: DANGLING, + len: 0, + head: CopyInfo { + src: None, + dst: None, + }, + tail: CopyInfo { + src: None, + dst: None, + }, + span: PageSpan::empty(), + addrsp: None, + }); + } + + let cur_space_lock = AddrSpace::current()?; + let dst_space_lock = { + match context_weak.upgrade() { + Some(ctx) => { + if context::is_current(&ctx) { + // Will bail below this code + Arc::clone(&cur_space_lock) + } else { + Arc::clone(ctx.read(token.token()).addr_space()?) + } + } + None => return Err(Error::new(ESRCH)), + } + }; + + if Arc::ptr_eq(&dst_space_lock, &cur_space_lock) { + // Same address space, no need to remap anything! 
+ return Ok(CaptureGuard { + destroyed: false, + base: user_buf.addr(), + len: user_buf.len(), + head: CopyInfo { + src: None, + dst: None, + }, + tail: CopyInfo { + src: None, + dst: None, + }, + span: PageSpan::empty(), + addrsp: Some(dst_space_lock), + }); + } + + let (src_page, page_count, offset) = page_range_containing(user_buf.addr(), user_buf.len()); + + let align_offset = if offset == 0 { 0 } else { PAGE_SIZE - offset }; + let (head_part_of_buf, middle_tail_part_of_buf) = user_buf + .split_at(core::cmp::min(align_offset, user_buf.len())) + .expect("split must succeed"); + + let middle_page_count = middle_tail_part_of_buf.len() / PAGE_SIZE; + let tail_size = middle_tail_part_of_buf.len() % PAGE_SIZE; + + let (_middle_part_of_buf, tail_part_of_buf) = middle_tail_part_of_buf + .split_at(middle_page_count * PAGE_SIZE) + .expect("split must succeed"); + + let head_len = core::cmp::min(PAGE_SIZE - offset, user_buf.len()); + + let head_buf_opt = if !head_part_of_buf.is_empty() { + // FIXME: Signal context can probably recursively use head/tail. + let mut array = BorrowedHtBuf::head_locked(token.downgrade())?; + if READ { + array.buf_mut()[..offset].fill(0_u8); + array.buf_mut()[offset + head_len..].fill(0_u8); + let slice = &mut array.buf_mut()[offset..][..head_len]; + + head_part_of_buf + .reinterpret_unchecked::() + .copy_to_slice(slice)?; + } else { + array.buf_mut().fill(0_u8); + } + Some(array) + } else { + None + }; + + let tail_buf_opt = if !tail_part_of_buf.is_empty() { + // FIXME: Signal context can probably recursively use head/tail. 
+ let mut array = BorrowedHtBuf::tail_locked(token.downgrade())?; + + if READ { + let (to_copy, to_zero) = array.buf_mut().split_at_mut(tail_size); + to_zero.fill(0_u8); + + // FIXME: remove reinterpret_unchecked + tail_part_of_buf + .reinterpret_unchecked::() + .copy_to_slice(to_copy)?; + } else { + array.buf_mut().fill(0_u8); + } + Some(array) + } else { + None + }; + + let mut dst_space_guard = dst_space_lock.acquire_write(token.downgrade()); + let (dst_space, _token_split) = dst_space_guard.token_split(); + + let free_span = dst_space + .grants + .find_free(dst_space.mmap_min, page_count) + .ok_or(Error::new(ENOMEM))?; + + let head = if let Some(array) = head_buf_opt { + let frame = array.frame(); + dst_space.mmap( + &dst_space_lock, + Some(free_span.base), + ONE, + map_flags | MAP_FIXED_NOREPLACE, + None, + move |dst_page, page_flags, mapper, flusher| { + let is_pinned = true; + Grant::allocated_shared_one_page( + frame, dst_page, page_flags, mapper, flusher, is_pinned, + ) + }, + )?; + + CopyInfo { + src: Some(array), + dst: WRITE.then_some(head_part_of_buf.reinterpret_unchecked()), + } + } else { + CopyInfo { + src: None, + dst: None, + } + }; + let (first_middle_dst_page, first_middle_src_page) = if !head_part_of_buf.is_empty() { + (free_span.base.next(), src_page.next()) + } else { + (free_span.base, src_page) + }; + + if let Some(middle_page_count) = NonZeroUsize::new(middle_page_count) { + dst_space.mmap( + &dst_space_lock, + Some(first_middle_dst_page), + middle_page_count, + map_flags | MAP_FIXED_NOREPLACE, + None, + move |dst_page, _, mapper, flusher| { + let eager = true; + + // It doesn't make sense to allow a context, that has borrowed non-RAM physical + // memory, to DIRECTLY do scheme calls onto that memory. + // + // (TODO: Maybe there are some niche use cases for that, possibly PCI transfer + // BARs, but it doesn't make sense yet.) + let allow_phys = false; + + // Deny any attempts by the scheme, to unmap these temporary pages. 
The only way to + // unmap them is to respond to the scheme socket. + let is_pinned_userscheme_borrow = true; + + // TODO: Not a Lock ordering violation + // we've checked Arc::ptr_eq(&dst_space_lock, &cur_space_lock) before, + // but it's difficult to apply cur_space_lock.arquire_rewrite + let mut token = unsafe { CleanLockToken::new() }; + let mut cur_space_guard = + unsafe { cur_space_lock.acquire_rewrite(token.downgrade()) }; + Grant::borrow( + Arc::clone(&cur_space_lock), + &mut cur_space_guard, + first_middle_src_page, + dst_page, + middle_page_count.get(), + map_flags, + mapper, + flusher, + eager, + allow_phys, + is_pinned_userscheme_borrow, + ) + }, + )?; + } + + let tail = if let Some(array) = tail_buf_opt { + let tail_dst_page = first_middle_dst_page.next_by(middle_page_count); + let frame = array.frame(); + + dst_space.mmap( + &dst_space_lock, + Some(tail_dst_page), + ONE, + map_flags | MAP_FIXED_NOREPLACE, + None, + move |dst_page, page_flags, mapper, flusher| { + let is_pinned = true; + Grant::allocated_shared_one_page( + frame, dst_page, page_flags, mapper, flusher, is_pinned, + ) + }, + )?; + + CopyInfo { + src: Some(array), + dst: WRITE.then_some(tail_part_of_buf.reinterpret_unchecked()), + } + } else { + CopyInfo { + src: None, + dst: None, + } + }; + + drop(dst_space_guard); + + let base = free_span.base.start_address().data() + offset; + Ok(CaptureGuard { + destroyed: false, + base, + len: user_buf.len(), + head, + tail, + span: { + let (first_page, page_count, _offset) = page_range_containing(base, user_buf.len()); + PageSpan::new(first_page, page_count) + }, + addrsp: Some(dst_space_lock), + }) + } + + pub fn read(&self, buf: UserSliceWo, flags: u32, token: &mut CleanLockToken) -> Result { + // If O_NONBLOCK is used, do not block + let nonblock = flags & O_NONBLOCK as u32 != 0; + + match self + .todo + .receive_into_user(buf, !nonblock, "UserInner::read (v2)", token) + { + // If we received requests, return them to the scheme handler + 
Ok(byte_count) => Ok(byte_count), + // If there were no requests and O_NONBLOCK was used (EAGAIN), or some other error + // occurred, return that. + Err(error) => Err(error), + } + } + + pub fn write(&self, buf: UserSliceRo, token: &mut CleanLockToken) -> Result { + let mut bytes_read = 0; + for chunk in buf.in_exact_chunks(size_of::()) { + match ParsedCqe::parse_cqe(&unsafe { chunk.read_exact::()? }) + .and_then(|p| self.handle_parsed(&p, token)) + { + Ok(()) => bytes_read += size_of::(), + Err(_) if bytes_read > 0 => break, + Err(error) => return Err(error), + } + } + Ok(bytes_read) + } + pub fn request_fmap( + &self, + id: usize, + _offset: u64, + required_page_count: usize, + flags: MapFlags, + token: &mut CleanLockToken, + ) -> Result<()> { + info!("REQUEST FMAP"); + + let tag = self.next_id(token)?; + { + let mut states = self.states.lock(token.token()); + states[tag as usize] = State::Fmap(Arc::downgrade(&context::current())); + } + + self.todo.send( + Sqe { + opcode: Opcode::RequestMmap as u8, + sqe_flags: SqeFlags::empty(), + _rsvd: 0, + tag, + args: [ + id as u64, + flags.bits() as u64, + required_page_count as u64, + 0, + 0, + uid_gid_hack_merge(current_uid_gid(token)), + ], + caller: { context::current().read(token.token()).pid as u64 }, + }, + token, + ); + event::trigger(self.root_id, self.scheme_id.get(), EVENT_READ, token); + + Ok(()) + } + fn handle_parsed(&self, cqe: &ParsedCqe, token: &mut CleanLockToken) -> Result<()> { + match *cqe { + ParsedCqe::RegularResponse { tag, res, extra0 } => { + self.respond(tag, Response::Regular(res, extra0, false), token)? + } + ParsedCqe::RespondAndNotifyOnDetach { tag, res, extra0 } => { + self.respond(tag, Response::Regular(res, extra0, true), token)? 
+ } + ParsedCqe::ResponseWithFd { tag, fd } => self.respond( + tag, + Response::Fd({ + { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context.remove_file(FileHandle::from(fd), &mut token) + } + .ok_or(Error::new(EINVAL))? + .description + }), + token, + )?, + ParsedCqe::ResponseWithMultipleFds { tag, num_fds: _ } => { + self.respond(tag, Response::MultipleFds(None), token)?; + } + ParsedCqe::ObtainFd { + tag, + flags, + dst_fd_or_ptr, + } => { + let description = { + match self + .states + .lock(token.token()) + .get_mut(tag as usize) + .ok_or(Error::new(EINVAL))? + { + &mut State::Waiting { ref mut fds, .. } => { + if fds.is_empty() { + return Err(Error::new(ENOENT)); + } + fds.remove(0) + } + _ => return Err(Error::new(ENOENT)), + } + }; + + // FIXME: Description can leak if there is no additional file table space. + if flags.contains(FobtainFdFlags::MANUAL_FD) { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context.insert_file( + FileHandle::from(dst_fd_or_ptr), + FileDescriptor { + description, + cloexec: true, + }, + &mut token, + ); + } else { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let fd = context + .add_file( + FileDescriptor { + description, + cloexec: true, + }, + &mut token, + ) + .ok_or(Error::new(EMFILE))?; + UserSlice::wo(dst_fd_or_ptr, size_of::())?.write_usize(fd.get())?; + } + } + ParsedCqe::ProvideMmap { + tag, + offset, + base_addr, + page_count, + } => { + info!( + "PROVIDE_MAP {:x} {:x} {:?} {:x}", + tag, offset, base_addr, page_count + ); + + if offset % PAGE_SIZE as u64 != 0 { + return Err(Error::new(EINVAL)); + } + + if base_addr.data() % PAGE_SIZE != 0 { + return Err(Error::new(EINVAL)); + } + + if page_count != 1 { + return 
Err(Error::new(EINVAL)); + } + + let context = { + let mut states = self.states.lock(token.token()); + match states.get_mut(tag as usize) { + Some(o) => match mem::replace(o, State::Placeholder) { + // invalid state + State::Placeholder => { + return Err(Error::new(EBADFD)); + } + // invalid kernel to scheme call + old_state @ (State::Waiting { .. } | State::Responded(_)) => { + *o = old_state; + return Err(Error::new(EINVAL)); + } + State::Fmap(context) => { + states.remove(tag as usize); + context + } + }, + None => return Err(Error::new(EINVAL)), + } + }; + + let context = context.upgrade().ok_or(Error::new(ESRCH))?; + + let mut lock_token = token.token(); + let (frame, _) = AddrSpace::current()? + .acquire_read(lock_token.downgrade()) + .table + .utable + .translate(base_addr) + .ok_or(Error::new(EFAULT))?; + + { + let mut context = context.write(token.token()); + if let Status::HardBlocked { + reason: HardBlockedReason::AwaitingMmap { .. }, + } = context.status + { + context.status = Status::Runnable + } + context.fmap_ret = Some(Frame::containing(frame)); + } + } + ParsedCqe::TriggerFevent { number, flags } => { + event::trigger(self.scheme_id, number, flags, token) + } + } + Ok(()) + } + fn respond(&self, tag: u32, mut response: Response, token: &mut CleanLockToken) -> Result<()> { + let to_close: Vec; + + { + let mut states_lock = self.states.lock(token.token()); + let (states, mut lock_token) = states_lock.token_split(); + match states.get_mut(tag as usize) { + Some(o) => match mem::replace(o, State::Placeholder) { + // invalid state + State::Placeholder => return Err(Error::new(EBADFD)), + // invalid scheme to kernel call + old_state @ (State::Responded(_) | State::Fmap(_)) => { + *o = old_state; + return Err(Error::new(EINVAL)); + } + + State::Waiting { + context, + fds, + canceling, + callee_responsible, + } => { + // Convert ECANCELED to EINTR if a request was being canceled (currently always + // due to signals). 
+ if let Response::Regular(ref mut res, _, _) = response + && canceling + && *res == Err(Error::new(ECANCELED)) + { + *res = Err(Error::new(EINTR)); + } + + // TODO: Require ECANCELED? + if let Response::Regular(ref mut res, _, _) = response + && !canceling + && *res == Err(Error::new(EINTR)) + { + // EINTR is valid after cancelation has been requested, but not otherwise. + // This is because the userspace signal trampoline will be invoked after a + // syscall returns EINTR. + *res = Err(Error::new(EIO)); + } + + if let Response::MultipleFds(ref mut response_fds) = response { + *response_fds = Some(fds); + to_close = Vec::new(); + } else { + to_close = fds + .into_iter() + .filter_map(|f| Arc::try_unwrap(f).ok()) + .map(RwLock::into_inner) + .collect(); + } + + match context.upgrade() { + Some(context) => { + *o = State::Responded(response); + context.write(lock_token.token()).unblock(); + } + _ => { + states.remove(tag as usize); + } + } + + drop(states_lock); + + let unpin = true; + let res = AddrSpace::current()?.munmap(callee_responsible, unpin, token)?; + for r in res { + let _ = r.unmap(token); + } + } + }, + // invalid state + None => return Err(Error::new(EBADFD)), + } + } + + for fd in to_close { + let _ = fd.try_close(token); + } + Ok(()) + } + + pub fn fevent(&self, flags: EventFlags, token: &mut CleanLockToken) -> Result { + // TODO: Should the root scheme also suppress events if `flags` does not contain + // `EVENT_READ`? 
+ Ok(if self.todo.is_currently_empty(token) { + EventFlags::empty() + } else { + EventFlags::EVENT_READ.intersection(flags) + }) + } + + pub fn fsync(&self) -> Result<()> { + Ok(()) + } + + fn fmap_inner( + &self, + dst_addr_space: Arc, + file: usize, + map: &Map, + token: &mut CleanLockToken, + ) -> Result { + let unaligned_size = map.size; + + if unaligned_size == 0 { + return Err(Error::new(EINVAL)); + } + + let page_count = unaligned_size.div_ceil(PAGE_SIZE); + + if !map.address.is_multiple_of(PAGE_SIZE) { + return Err(Error::new(EINVAL)); + }; + + let fixed = map.flags.contains(MapFlags::MAP_FIXED) + || map.flags.contains(MapFlags::MAP_FIXED_NOREPLACE); + let dst_base = (map.address != 0 || fixed) + .then_some(Page::containing_address(VirtualAddress::new(map.address))); + + if !map.offset.is_multiple_of(PAGE_SIZE) { + return Err(Error::new(EINVAL)); + } + + let src_address_space = { + Arc::clone( + self.context + .upgrade() + .ok_or(Error::new(ENODEV))? + .read(token.token()) + .addr_space()?, + ) + }; + if Arc::ptr_eq(&src_address_space, &dst_addr_space) { + return Err(Error::new(EBUSY)); + } + + let (ctx, desc) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let mut files = context.files.read(token.token()); + let (files, mut token) = files.token_split(); + let desc = files.find_by_scheme(self.scheme_id, file, &mut token)?; + (context.caller_ctx(), desc.description) + }; + + let response = self.call( + ctx, + Vec::new(), + Opcode::MmapPrep, + [ + file as u64, + unaligned_size as u64, + map.flags.bits() as u64, + map.offset as u64, + ], + &mut PageSpan::empty(), + token, + )?; + + // TODO: I've previously tested that this works, but because the scheme trait all of + // Redox's schemes currently rely on doesn't allow one-way messages, there's no current + // code using it. 
+ + //let mapping_is_lazy = map.flags.contains(MapFlags::MAP_LAZY); + let mapping_is_lazy = false; + + let base_page_opt = (!mapping_is_lazy).then_some(response.into_regular()?); + + let file_ref = GrantFileRef { + description: desc, + base_offset: map.offset, + }; + + let mut lock_token = token.token(); + let src = match base_page_opt { + Some(base_addr) => Some({ + if base_addr % PAGE_SIZE != 0 { + return Err(Error::new(EINVAL)); + } + let addr_space_lock = &src_address_space; + BorrowedFmapSource { + src_base: Page::containing_address(VirtualAddress::new(base_addr)), + addr_space_lock, + addr_space_guard: addr_space_lock.acquire_write(lock_token.downgrade()), + mode: if map.flags.contains(MapFlags::MAP_SHARED) { + MmapMode::Shared + } else { + MmapMode::Cow + }, + } + }), + None => None, + }; + + let page_count_nz = NonZeroUsize::new(page_count).expect("already validated map.size != 0"); + let mut notify_files = Vec::new(); + // TODO: Not a Lock ordering violation + // we've checked Arc::ptr_eq(&src_address_space, &dst_addr_space) before, + // but it's difficult to apply src.arquire_rewrite + let mut clean_token = unsafe { CleanLockToken::new() }; + let dst_base = { + dst_addr_space.acquire_write(clean_token.downgrade()).mmap( + &dst_addr_space, + dst_base, + page_count_nz, + map.flags, + Some(&mut notify_files), + |dst_base, flags, mapper, flusher| { + Grant::borrow_fmap( + PageSpan::new(dst_base, page_count), + flags, + file_ref, + src, + &dst_addr_space, + mapper, + flusher, + ) + }, + )? 
+ }; + + for map in notify_files { + let _ = map.unmap(token); + } + + Ok(dst_base.start_address().data()) + } + + pub fn call_fdwrite( + &self, + descs: Vec>, + flags: CallFlags, + _arg: u64, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + if metadata.is_empty() { + return Err(Error::new(EINVAL)); + } + let Some(verb) = SchemeSocketCall::try_from_raw(metadata[0] as usize) else { + return Err(Error::new(EINVAL)); + }; + + match verb { + SchemeSocketCall::MoveFd => { + if metadata.len() != 2 { + return Err(Error::new(EINVAL)); + } + let mut movefd_flags = FmoveFdFlags::empty(); + if flags.contains(CallFlags::FD_EXCLUSIVE) { + movefd_flags |= FmoveFdFlags::EXCLUSIVE; + } + if flags.contains(CallFlags::FD_CLONE) { + movefd_flags |= FmoveFdFlags::CLONE; + } + self.handle_movefd(descs, metadata[1] as usize, movefd_flags, token) + } + _ => Err(Error::new(EINVAL)), + } + } + + fn handle_movefd( + &self, + descs: Vec>, + request_id: usize, + _flags: FmoveFdFlags, + token: &mut CleanLockToken, + ) -> Result { + let num_fds = descs.len(); + match self + .states + .lock(token.token()) + .get_mut(request_id) + .ok_or(Error::new(EINVAL))? + { + &mut State::Waiting { ref mut fds, .. 
} => *fds = descs, + _ => return Err(Error::new(ENOENT)), + }; + + Ok(num_fds) + } + + pub fn call_fdread( + &self, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + if metadata.is_empty() { + return Err(Error::new(EINVAL)); + } + debug!( + "call_fdread: payload: {} metadata: {}", + payload.len(), + metadata.len() + ); + + let Some(verb) = SchemeSocketCall::try_from_raw(metadata[0] as usize) else { + return Err(Error::new(EINVAL)); + }; + + match verb { + SchemeSocketCall::ObtainFd => { + if metadata.len() != 2 { + return Err(Error::new(EINVAL)); + } + let mut obtainfd_flags = FobtainFdFlags::empty(); + if flags.contains(CallFlags::FD_UPPER) { + obtainfd_flags |= FobtainFdFlags::UPPER_TBL; + } + if flags.contains(CallFlags::FD_EXCLUSIVE) { + obtainfd_flags |= FobtainFdFlags::EXCLUSIVE; + } + if flags.contains(CallFlags::FD_CLOEXEC) { + obtainfd_flags |= FobtainFdFlags::CLOEXEC; + } + self.handle_obtainfd(payload, metadata[1] as usize, obtainfd_flags, token) + } + _ => Err(Error::new(EINVAL)), + } + } + + fn handle_obtainfd( + &self, + payload: UserSliceRw, + request_id: usize, + flags: FobtainFdFlags, + token: &mut CleanLockToken, + ) -> Result { + let descriptions = match self + .states + .lock(token.token()) + .get_mut(request_id) + .ok_or(Error::new(EINVAL))? + { + &mut State::Waiting { ref mut fds, .. } => mem::take(fds), + _ => return Err(Error::new(ENOENT)), + }; + + let mut token = token.downgrade(); + let num_fds = if flags.contains(FobtainFdFlags::UPPER_TBL) { + bulk_insert_fds( + descriptions, + payload, + flags.contains(FobtainFdFlags::CLOEXEC), + &mut token.token(), + )? + } else { + bulk_add_fds( + descriptions, + payload, + flags.contains(FobtainFdFlags::CLOEXEC), + &mut token.token(), + )? 
+ }; + + Ok(num_fds) + } + + pub fn into_drop(self, token: &mut CleanLockToken) { + self.todo.condition.into_drop(token); + } +} +pub struct CaptureGuard { + destroyed: bool, + base: usize, + len: usize, + span: PageSpan, + + head: CopyInfo, + tail: CopyInfo, + addrsp: Option>, +} +impl CaptureGuard { + fn base(&self) -> usize { + self.base + } + fn len(&self) -> usize { + self.len + } + fn span(&mut self) -> &mut PageSpan { + &mut self.span + } +} +struct CopyInfo { + src: Option, + + // TODO + dst: Option>, +} +impl CaptureGuard { + fn release_inner(&mut self, token: &mut CleanLockToken) -> Result<()> { + if self.destroyed { + return Ok(()); + } + self.destroyed = true; + + if self.base == DANGLING { + return Ok(()); + } + + // TODO: Encode src and dst better using const generics. + if let CopyInfo { + src: Some(ref src), + dst: Some(ref mut dst), + } = self.head + { + dst.copy_from_slice(&src.buf()[self.base % PAGE_SIZE..][..dst.len()])?; + } + if let CopyInfo { + src: Some(ref src), + dst: Some(ref mut dst), + } = self.tail + { + dst.copy_from_slice(&src.buf()[..dst.len()])?; + } + let unpin = true; + if let Some(ref addrsp) = self.addrsp + && !self.span.is_empty() + { + let res = addrsp.munmap(self.span, unpin, token)?; + for r in res { + let _ = r.unmap(token); + } + } + + Ok(()) + } + pub fn release(mut self, token: &mut CleanLockToken) -> Result<()> { + self.release_inner(token)?; + if let Some(addrsp) = self.addrsp.take() + && let Some(addrsp) = Arc::into_inner(addrsp) + { + addrsp.into_drop(token); + } + if let Some(src) = self.head.src.take() { + src.into_drop(token); + } + if let Some(src) = self.tail.src.take() { + src.into_drop(token); + } + let _ = ManuallyDrop::new(self); + Ok(()) + } +} +impl Drop for CaptureGuard { + fn drop(&mut self) { + let mut token = unsafe { CleanLockToken::new() }; + let _ = self.release_inner(&mut token); + #[cfg(feature = "drop_panic")] + { + panic!("CaptureGuard dropped"); + } + } +} +/// base..base+size => 
page..page+page_count*PAGE_SIZE, offset +fn page_range_containing(base: usize, size: usize) -> (Page, usize, usize) { + let first_page = Page::containing_address(VirtualAddress::new(base)); + let offset = base - first_page.start_address().data(); + + (first_page, (size + offset).div_ceil(PAGE_SIZE), offset) +} + +/// `UserInner` has to be wrapped +#[derive(Clone)] +pub struct UserScheme { + pub(crate) inner: Arc, +} + +impl UserScheme { + pub fn new(inner: Arc) -> UserScheme { + UserScheme { inner } + } +} + +impl KernelScheme for UserScheme { + fn kopenat( + &self, + file: usize, + path: super::StrOrBytes, + flags: usize, + fcntl_flags: u32, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + let mut address = self.inner.copy_and_capture_tail(path.as_bytes(), token)?; + let result = self.inner.call( + ctx, + Vec::new(), + Opcode::OpenAt, + [file, address.base(), address.len(), flags, fcntl_flags as _], + address.span(), + token, + ); + + address.release(token)?; + + match result? 
{ + Response::Regular(res, fl, _) => Ok({ + let fd = res?; + OpenResult::SchemeLocal( + fd, + InternalFlags::from_extra0(fl).ok_or(Error::new(EINVAL))?, + ) + }), + Response::Fd(desc) => Ok(OpenResult::External(desc)), + Response::MultipleFds(_) => Err(Error::new(EIO)), + } + } + + fn unlinkat( + &self, + file: usize, + path: &str, + flags: usize, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result<()> { + let mut address = self.inner.copy_and_capture_tail(path.as_bytes(), token)?; + match self.inner.call( + ctx, + Vec::new(), + Opcode::UnlinkAt, + [file, address.base(), address.len(), flags], + address.span(), + token, + ) { + Ok(res) => { + address.release(token)?; + res.into_regular() + } + Err(e) => { + let _ = address.release(token); + Err(e) + } + }?; + Ok(()) + } + + fn fsize(&self, file: usize, token: &mut CleanLockToken) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner + .call( + ctx, + Vec::new(), + Opcode::Fsize, + [file], + &mut PageSpan::empty(), + token, + )? + .into_regular() + .map(|o| o as u64) + } + + fn fchmod(&self, file: usize, mode: u16, token: &mut CleanLockToken) -> Result<()> { + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner + .call( + ctx, + Vec::new(), + Opcode::Fchmod, + [file, mode as usize], + &mut PageSpan::empty(), + token, + )? + .into_regular()?; + Ok(()) + } + + fn fchown(&self, file: usize, uid: u32, gid: u32, token: &mut CleanLockToken) -> Result<()> { + { + let ctx = context::current(); + let cx = &ctx.read(token.token()); + if cx.euid != 0 && (uid != cx.euid || gid != cx.egid) { + return Err(Error::new(EPERM)); + } + } + + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner + .call( + ctx, + Vec::new(), + Opcode::Fchown, + [file, uid as usize, gid as usize], + &mut PageSpan::empty(), + token, + )? 
+ .into_regular()?; + Ok(()) + } + + fn fcntl( + &self, + file: usize, + cmd: usize, + arg: usize, + token: &mut CleanLockToken, + ) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner + .call( + ctx, + Vec::new(), + Opcode::Fcntl, + [file, cmd, arg], + &mut PageSpan::empty(), + token, + )? + .into_regular() + } + + fn fevent( + &self, + file: usize, + flags: EventFlags, + token: &mut CleanLockToken, + ) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner + .call( + ctx, + Vec::new(), + Opcode::Fevent, + [file, flags.bits()], + &mut PageSpan::empty(), + token, + )? + .into_regular() + .map(EventFlags::from_bits_truncate) + } + + fn flink( + &self, + file: usize, + path: &str, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result<()> { + let mut address = self.inner.copy_and_capture_tail(path.as_bytes(), token)?; + match self.inner.call( + ctx, + Vec::new(), + Opcode::Flink, + [file, address.base(), address.len()], + address.span(), + token, + ) { + Ok(res) => { + address.release(token)?; + res.into_regular() + } + Err(err) => { + let _ = address.release(token); + Err(err) + } + }?; + Ok(()) + } + + fn frename( + &self, + file: usize, + path: &str, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result<()> { + let mut address = self.inner.copy_and_capture_tail(path.as_bytes(), token)?; + match self.inner.call( + ctx, + Vec::new(), + Opcode::Frename, + [file, address.base(), address.len()], + address.span(), + token, + ) { + Ok(res) => { + address.release(token)?; + res.into_regular() + } + Err(err) => { + let _ = address.release(token); + Err(err) + } + }?; + Ok(()) + } + + fn fsync(&self, file: usize, token: &mut CleanLockToken) -> Result<()> { + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner + .call( + ctx, + Vec::new(), + Opcode::Fsync, + [file], + &mut PageSpan::empty(), + token, + )? 
+ .into_regular()?; + Ok(()) + } + + fn ftruncate(&self, file: usize, len: usize, token: &mut CleanLockToken) -> Result<()> { + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner + .call( + ctx, + Vec::new(), + Opcode::Ftruncate, + [file, len], + &mut PageSpan::empty(), + token, + )? + .into_regular()?; + Ok(()) + } + + fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + self.inner.todo.send( + Sqe { + opcode: Opcode::CloseMsg as u8, + sqe_flags: SqeFlags::empty(), + _rsvd: 0, + tag: 0, + args: [id as u64, 0, 0, 0, 0, 0], + caller: 0, // TODO? + }, + token, + ); + + event::trigger( + self.inner.root_id, + self.inner.scheme_id.get(), + EVENT_READ, + token, + ); + + Ok(()) + } + + fn detach(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + let ctx = { context::current().read(token.token()).caller_ctx() }; + self.inner.todo.send( + Sqe { + opcode: Opcode::Detach as u8, + sqe_flags: SqeFlags::empty(), + _rsvd: 0, + tag: 0, + args: [id as u64, 0, 0, 0, 0, 0], + caller: ctx.pid as u64, + }, + token, + ); + event::trigger( + self.inner.root_id, + self.inner.scheme_id.get(), + EVENT_READ, + token, + ); + Ok(()) + } + + fn kdup( + &self, + file: usize, + buf: UserSliceRo, + ctx: CallerCtx, + token: &mut CleanLockToken, + ) -> Result { + let inner = self.inner.clone(); + let mut address = inner.capture_user(buf, token)?; + let result = inner.call( + ctx, + Vec::new(), + Opcode::Dup, + [file, address.base(), address.len()], + address.span(), + token, + ); + + address.release(token)?; + + match result? 
{ + Response::Regular(res, fl, _) => Ok({ + let fd = res?; + OpenResult::SchemeLocal( + fd, + InternalFlags::from_extra0(fl).ok_or(Error::new(EINVAL))?, + ) + }), + Response::Fd(desc) => Ok(OpenResult::External(desc)), + Response::MultipleFds(_) => Err(Error::new(EIO)), + } + } + fn kfpath(&self, file: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + let mut address = self.inner.capture_user(buf, token)?; + let result = self + .inner + .call( + ctx, + Vec::new(), + Opcode::Fpath, + [file, address.base(), address.len()], + address.span(), + token, + )? + .into_regular(); + address.release(token)?; + result + } + + fn kreadoff( + &self, + file: usize, + buf: UserSliceWo, + offset: u64, + call_flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + + let mut address = self.inner.capture_user(buf, token)?; + let result = self + .inner + .call( + ctx, + Vec::new(), + Opcode::Read, + [ + file as u64, + address.base() as u64, + address.len() as u64, + offset, + u64::from(call_flags), + ], + address.span(), + token, + )? + .into_regular(); + address.release(token)?; + + result + } + + fn kwriteoff( + &self, + file: usize, + buf: UserSliceRo, + offset: u64, + call_flags: u32, + _stored_flags: u32, + token: &mut CleanLockToken, + ) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + + let mut address = self.inner.capture_user(buf, token)?; + let result = self + .inner + .call( + ctx, + Vec::new(), + Opcode::Write, + [ + file as u64, + address.base() as u64, + address.len() as u64, + offset, + u64::from(call_flags), + ], + address.span(), + token, + )? 
+ .into_regular(); + address.release(token)?; + + result + } + fn kfutimens( + &self, + file: usize, + buf: UserSliceRo, + token: &mut CleanLockToken, + ) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + let mut address = self.inner.capture_user(buf, token)?; + let result = self + .inner + .call( + ctx, + Vec::new(), + Opcode::Futimens, + [file, address.base(), address.len()], + address.span(), + token, + )? + .into_regular(); + address.release(token)?; + result + } + fn getdents( + &self, + file: usize, + buf: UserSliceWo, + header_size: u16, + opaque_id_start: u64, + token: &mut CleanLockToken, + ) -> Result { + let ctx = { context::current().read(token.token()).caller_ctx() }; + let mut address = self.inner.capture_user(buf, token)?; + // TODO: Support passing the 16-byte record_len of the last dent, to make it possible to + // iterate backwards without first interating forward? The last entry will contain the + // opaque id to pass to the next getdents. Since this field is small, this would fit in the + // extra_raw field of `Cqe`s. + let result = self + .inner + .call( + ctx, + Vec::new(), + Opcode::Getdents, + [ + file, + address.base(), + address.len(), + header_size.into(), + opaque_id_start as usize, + ], + address.span(), + token, + )? + .into_regular(); + address.release(token)?; + result + } + fn kfstat(&self, file: usize, stat: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let ctx = { context::current().read(token.token()).caller_ctx() }; + let mut address = self.inner.capture_user(stat, token)?; + let result = self + .inner + .call( + ctx, + Vec::new(), + Opcode::Fstat, + [file, address.base(), address.len()], + address.span(), + token, + )? 
+ .into_regular(); + address.release(token)?; + result.map(|_| ()) + } + fn kfstatvfs(&self, file: usize, stat: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let ctx = { context::current().read(token.token()).caller_ctx() }; + let mut address = self.inner.capture_user(stat, token)?; + let result = self + .inner + .call( + ctx, + Vec::new(), + Opcode::Fstatvfs, + [file, address.base(), address.len()], + address.span(), + token, + )? + .into_regular(); + address.release(token)?; + result.map(|_| ()) + } + fn kfmap( + &self, + file: usize, + addr_space: &Arc, + map: &Map, + _consume: bool, + token: &mut CleanLockToken, + ) -> Result { + self.inner + .fmap_inner(Arc::clone(addr_space), file, map, token) + } + fn kfunmap( + &self, + number: usize, + offset: usize, + size: usize, + flags: MunmapFlags, + token: &mut CleanLockToken, + ) -> Result<()> { + let inner = self.inner.clone(); + + let ctx = { context::current().read(token.token()).caller_ctx() }; + let res = inner.call( + ctx, + Vec::new(), + Opcode::Munmap, + [number, size, flags.bits(), offset], + &mut PageSpan::empty(), + token, + )?; + + res.into_regular()?; + Ok(()) + } + fn kcall( + &self, + id: usize, + payload: UserSliceRw, + _flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let inner = self.inner.clone(); + + let mut address = inner.capture_user(payload, token)?; + let ctx = { context::current().read(token.token()).caller_ctx() }; + + let mut sqe = Sqe { + opcode: Opcode::Call as u8, + sqe_flags: SqeFlags::empty(), + _rsvd: 0, + tag: inner.next_id(token)?, + caller: ctx.pid as u64, + args: [ + id as u64, + address.base() as u64, + address.len() as u64, + 0, + 0, + 0, + ], + }; + { + let dst = &mut sqe.args[3..]; + let len = dst.len().min(metadata.len()); + dst[..len].copy_from_slice(&metadata[..len]); + } + match inner.call_inner(Vec::new(), sqe, address.span(), token) { + Ok(res) => { + address.release(token)?; + res.into_regular() + } + Err(e) => { + let 
_ = address.release(token); + Err(e) + } + } + } + fn kstdfscall( + &self, + id: usize, + _kind: StdFsCallKind, + desc: Arc, + payload: UserSliceRw, + _flags: CallFlags, + metadata: StdFsCallMeta, + token: &mut CleanLockToken, + ) -> Result { + let inner = self.inner.clone(); + + let mut address = inner.capture_user(payload, token)?; + let ctx = { context::current().read(token.token()).caller_ctx() }; + + let mut sqe = Sqe { + opcode: Opcode::StdFsCall as u8, + sqe_flags: SqeFlags::empty(), + _rsvd: 0, + tag: inner.next_id(token)?, + caller: ctx.pid as u64, + args: [ + id as u64, + address.base() as u64, + address.len() as u64, + 0, + 0, + 0, + ], + }; + { + let dst = &mut sqe.args[3..]; + let len = dst.len().min(metadata.len()); + dst[..len].copy_from_slice(&metadata[..len]); + } + match inner.call_inner(Vec::new(), sqe, address.span(), token)? { + Response::Regular(res, _, notify_on_detach) => { + address.release(token)?; + desc.write(token.token()) + .internal_flags + .set(InternalFlags::NOTIFY_ON_NEXT_DETACH, notify_on_detach); + res + } + _ => { + let _ = address.release(token); + Err(Error::new(EIO)) + } + } + } + + fn kfdwrite( + &self, + number: usize, + descs: Vec>, + flags: CallFlags, + arg: u64, + _metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let inner = self.inner.clone(); + + let mut sendfd_flags = SendFdFlags::empty(); + if flags.contains(CallFlags::FD_EXCLUSIVE) { + sendfd_flags |= SendFdFlags::EXCLUSIVE; + } + + let ctx = { context::current().read(token.token()).caller_ctx() }; + let len = descs.len(); + inner + .call( + ctx, + descs, + Opcode::Sendfd, + [number, sendfd_flags.bits(), arg as usize, len], + &mut PageSpan::empty(), + token, + )? 
+ .into_regular() + } + fn kfdread( + &self, + id: usize, + payload: UserSliceRw, + flags: CallFlags, + _metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let inner = self.inner.clone(); + if !payload.len().is_multiple_of(size_of::()) { + return Err(Error::new(EINVAL)); + } + + let mut recvfd_flags = RecvFdFlags::empty(); + if flags.contains(CallFlags::FD_UPPER) { + recvfd_flags |= RecvFdFlags::UPPER_TBL; + } + if flags.contains(CallFlags::FD_CLOEXEC) { + recvfd_flags |= RecvFdFlags::CLOEXEC; + } + + let ctx = { context::current().read(token.token()).caller_ctx() }; + let len = payload.len() / size_of::(); + let res = inner.call( + ctx, + Vec::new(), + Opcode::Recvfd, + [id, recvfd_flags.bits(), len], + &mut PageSpan::empty(), + token, + )?; + + let descriptions_opt = match res { + Response::Regular(res, _, _) => { + return match res { + Ok(_) => Err(Error::new(EIO)), + Err(e) => Err(e), + } + } + Response::Fd(_) => return Err(Error::new(EIO)), + Response::MultipleFds(fds) => fds, + }; + + let mut token = token.downgrade(); + let num_fds = if let Some(descriptions) = descriptions_opt { + if recvfd_flags.contains(RecvFdFlags::UPPER_TBL) { + bulk_insert_fds( + descriptions, + payload, + recvfd_flags.contains(RecvFdFlags::CLOEXEC), + &mut token, + )? + } else { + bulk_add_fds( + descriptions, + payload, + recvfd_flags.contains(RecvFdFlags::CLOEXEC), + &mut token, + )? + } + } else { + 0 + }; + + Ok(num_fds) + } + fn translate_std_fs_call( + &self, + id: usize, + desc: Arc, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, + ) -> Result { + let &[kind, arg1, arg2, ..] 
= metadata else { + return Err(Error::new(EINVAL)); + }; + let Some(kind) = StdFsCallKind::try_from_raw(kind as u8) else { + return Err(Error::new(EOPNOTSUPP)); + }; + let metadata = StdFsCallMeta::new(kind, arg1, arg2); + self.kstdfscall(id, kind, desc, payload, flags, metadata, token) + } +} + +trait Args: Copy { + fn args(self) -> [u64; 6]; +} +impl Args for [u64; N] { + fn args(self) -> [u64; 6] { + const { assert!(N <= 6) }; + core::array::from_fn(|i| self.get(i).copied().unwrap_or(0)) + } +} +impl Args for [usize; N] { + fn args(self) -> [u64; 6] { + self.map(|s| s as u64).args() + } +} + +// TODO: Find a better way to do authentication. No scheme call currently uses arg 5 but this will +// likely change. Ideally this mechanism would also allow the scheme to query the supplementary +// group list. +fn uid_gid_hack_merge([uid, gid]: [u32; 2]) -> u64 { + u64::from(uid) | (u64::from(gid) << 32) +} +fn current_uid_gid(token: &mut CleanLockToken) -> [u32; 2] { + let ctx = context::current(); + let p = &ctx.read(token.token()); + [p.euid, p.egid] +} diff --git a/src/startup/memory.rs b/src/startup/memory.rs new file mode 100644 index 0000000000..26922dde0a --- /dev/null +++ b/src/startup/memory.rs @@ -0,0 +1,447 @@ +use crate::{ + arch::CurrentRmmArch, + memory::PAGE_SIZE, + startup::{memory::BootloaderMemoryKind::Null, KernelArgs}, +}; +use core::{ + cell::SyncUnsafeCell, + cmp::{max, min}, + slice::{self, Iter}, +}; +use rmm::{ + Arch, BumpAllocator, MemoryArea, PageFlags, PageMapper, PhysicalAddress, TableKind, + VirtualAddress, KILOBYTE, MEGABYTE, +}; + +// Keep synced with OsMemoryKind in bootloader +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[repr(u64)] +#[allow(dead_code)] +pub enum BootloaderMemoryKind { + Null = 0, + Free = 1, + Reclaim = 2, + Reserved = 3, + + // These are local to kernel + Kernel = 0x100, + Device = 0x101, + IdentityMap = 0x102, +} + +// Keep synced with OsMemoryEntry in bootloader +#[derive(Clone, Copy, Debug)] +#[repr(C, 
packed(8))] +struct BootloaderMemoryEntry { + pub base: u64, + pub size: u64, + pub kind: BootloaderMemoryKind, +} + +#[derive(Clone, Copy, Debug)] +struct MemoryEntry { + pub start: usize, + pub end: usize, + pub kind: BootloaderMemoryKind, +} + +impl MemoryEntry { + fn intersect(&self, other: &Self) -> Option { + let start = max(self.start, other.start); + let end = min(self.end, other.end); + if start < end { + Some(Self { + start, + end, + kind: self.kind, + }) + } else { + None + } + } + + fn combine(&self, other: &Self) -> Option { + if self.start <= other.end && self.end >= other.start { + Some(Self { + start: min(self.start, other.start), + end: max(self.end, other.end), + kind: self.kind, + }) + } else { + None + } + } +} + +struct MemoryMap { + entries: [MemoryEntry; 512], + size: usize, +} + +impl MemoryMap { + fn register(&mut self, base: usize, size: usize, kind: BootloaderMemoryKind) { + if self.size >= self.entries.len() { + panic!("Early memory map overflow!"); + } + let start = if kind == BootloaderMemoryKind::Free { + align_up(base) + } else { + align_down(base) + }; + let end = base.saturating_add(size); + let end = if kind == BootloaderMemoryKind::Free { + align_down(end) + } else { + align_up(end) + }; + if start < end + && let Some(entry) = self.entries.get_mut(self.size) + { + *entry = MemoryEntry { start, end, kind }; + self.size += 1; + } + } + + fn iter(&self) -> Iter<'_, MemoryEntry> { + self.entries[0..self.size].iter() + } + + pub fn free(&self) -> impl Iterator { + self.iter().filter(|x| x.kind == BootloaderMemoryKind::Free) + } + + pub fn non_free(&self) -> impl Iterator { + self.iter().filter(|x| x.kind != BootloaderMemoryKind::Free) + } + + pub fn kernel(&self) -> Option<&MemoryEntry> { + self.iter().find(|x| x.kind == BootloaderMemoryKind::Kernel) + } + + pub fn devices(&self) -> impl Iterator { + self.iter() + .filter(|x| x.kind == BootloaderMemoryKind::Device) + } + + pub fn identity_mapped(&self) -> impl Iterator { + self.iter() 
+ .filter(|x| x.kind == BootloaderMemoryKind::IdentityMap) + } +} + +static MEMORY_MAP: SyncUnsafeCell = SyncUnsafeCell::new(MemoryMap { + entries: [MemoryEntry { + start: 0, + end: 0, + kind: BootloaderMemoryKind::Null, + }; 512], + size: 0, +}); + +fn align_up(x: usize) -> usize { + (x.saturating_add(PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE +} +fn align_down(x: usize) -> usize { + x / PAGE_SIZE * PAGE_SIZE +} + +fn register_memory_from_kernel_args(args: &KernelArgs) { + register_bootloader_areas(args.areas_base as usize, args.areas_size as usize); + if let Some(dt) = args.dtb() { + crate::dtb::register_dev_memory_ranges(&dt); + } + register_memory_region( + args.kernel_base as usize, + args.kernel_size as usize, + BootloaderMemoryKind::Kernel, + ); + register_memory_region( + args.env_base as usize, + args.env_size as usize, + BootloaderMemoryKind::IdentityMap, + ); + register_memory_region( + args.hwdesc_base as usize, + args.hwdesc_size as usize, + BootloaderMemoryKind::IdentityMap, + ); + register_memory_region( + args.bootstrap_base as usize, + args.bootstrap_size as usize, + BootloaderMemoryKind::IdentityMap, + ); +} + +pub fn register_memory_region(base: usize, size: usize, kind: BootloaderMemoryKind) { + if kind != Null && size != 0 { + debug!("Registering {:?} memory {:X} size {:X}", kind, base, size); + unsafe { (*MEMORY_MAP.get()).register(base, size, kind) } + } +} + +fn register_bootloader_areas(areas_base: usize, areas_size: usize) { + let bootloader_areas = unsafe { + slice::from_raw_parts( + areas_base as *const BootloaderMemoryEntry, + areas_size / size_of::(), + ) + }; + for bootloader_area in bootloader_areas.iter() { + register_memory_region( + bootloader_area.base as usize, + bootloader_area.size as usize, + bootloader_area.kind, + ) + } +} + +unsafe fn add_memory(areas: &mut [MemoryArea], area_i: &mut usize, mut area: MemoryEntry) { + unsafe { + for reservation in (*MEMORY_MAP.get()).non_free() { + if area.end > reservation.start && area.end 
<= reservation.end { + info!( + "Memory {:X}:{:X} overlaps with reservation {:X}:{:X}", + area.start, area.end, reservation.start, reservation.end + ); + area.end = reservation.start; + } + if area.start >= area.end { + return; + } + + if area.start >= reservation.start && area.start < reservation.end { + info!( + "Memory {:X}:{:X} overlaps with reservation {:X}:{:X}", + area.start, area.end, reservation.start, reservation.end + ); + area.start = reservation.end; + } + if area.start >= area.end { + return; + } + + if area.start <= reservation.start && area.end > reservation.start { + info!( + "Memory {:X}:{:X} contains reservation {:X}:{:X}", + area.start, area.end, reservation.start, reservation.end + ); + debug_assert!(area.start < reservation.start && reservation.end < area.end, + "Should've contained reservation entirely: memory block {:X}:{:X} reservation {:X}:{:X}", + area.start, area.end, + reservation.start, reservation.end + ); + // recurse on first part of split memory block + + add_memory( + areas, + area_i, + MemoryEntry { + end: reservation.start, + ..area + }, + ); + + // and continue with the second part + area.start = reservation.end; + } + debug_assert!( + area.intersect(reservation).is_none(), + "Intersects with reservation! 
memory block {:X}:{:X} reservation {:X}:{:X}", + area.start, + area.end, + reservation.start, + reservation.end + ); + debug_assert!( + area.start < area.end, + "Empty memory block {:X}:{:X}", + area.start, + area.end + ); + } + + // Combine overlapping memory areas + let mut other_i = 0; + while other_i < *area_i { + let other = &areas[other_i]; + let other = MemoryEntry { + start: other.base.data(), + end: other.base.data().saturating_add(other.size), + kind: BootloaderMemoryKind::Free, + }; + if let Some(union) = area.combine(&other) { + debug!( + "{:X}:{:X} overlaps with area {:X}:{:X}, combining into {:X}:{:X}", + area.start, area.end, other.start, other.end, union.start, union.end + ); + area = union; + *area_i -= 1; // delete the original memory chunk + areas[other_i] = areas[*area_i]; + } else { + other_i = other_i.saturating_add(1); + } + } + + areas[*area_i].base = PhysicalAddress::new(area.start); + areas[*area_i].size = area.end.saturating_sub(area.start); + *area_i += 1; + } +} + +fn kernel_page_flags(virt: VirtualAddress) -> PageFlags { + use crate::kernel_executable_offsets::*; + let virt_addr = virt.data(); + + (if virt_addr >= __text_start() && virt_addr < __text_end() { + // Remap text read-only, execute + PageFlags::new().execute(true) + } else if virt_addr >= __rodata_start() && virt_addr < __rodata_end() { + // Remap rodata read-only, no execute + PageFlags::new() + } else { + // Remap everything else read-write, no execute + PageFlags::new().write(true) + }) + .global(cfg!(all(target_arch = "x86_64", not(feature = "pti")))) +} + +unsafe fn map_memory(areas: &[MemoryArea], mut bump_allocator: &mut BumpAllocator) { + unsafe { + let mut mapper = PageMapper::::create(TableKind::Kernel, &mut bump_allocator) + .expect("failed to create Mapper"); + + // Map all physical areas at PHYS_OFFSET + for area in areas.iter() { + for i in 0..area.size / PAGE_SIZE { + let phys = area.base.add(i * PAGE_SIZE); + let virt = A::phys_to_virt(phys); + let flags = 
kernel_page_flags::(virt); + let flush = mapper + .map_phys(virt, phys, flags) + .expect("failed to map frame"); + flush.ignore(); // Not the active table + } + } + + let kernel_area = (*MEMORY_MAP.get()).kernel().unwrap(); + let kernel_base = kernel_area.start; + let kernel_size = kernel_area.end.saturating_sub(kernel_area.start); + // Map kernel at KERNEL_OFFSET + for i in 0..kernel_size / A::PAGE_SIZE { + let phys = PhysicalAddress::new(kernel_base + i * PAGE_SIZE); + let virt = VirtualAddress::new( + crate::kernel_executable_offsets::KERNEL_OFFSET() + i * PAGE_SIZE, + ); + let flags = kernel_page_flags::(virt); + let flush = mapper + .map_phys(virt, phys, flags) + .expect("failed to map frame"); + flush.ignore(); // Not the active table + } + + for area in (*MEMORY_MAP.get()).identity_mapped() { + let base = area.start; + let size = area.end.saturating_sub(area.start); + for i in 0..size / PAGE_SIZE { + let phys = PhysicalAddress::new(base + i * PAGE_SIZE); + let virt = A::phys_to_virt(phys); + let flags = kernel_page_flags::(virt); + let flush = mapper + .map_phys(virt, phys, flags) + .expect("failed to map frame"); + flush.ignore(); // Not the active table + } + } + + //map dev mem + for area in (*MEMORY_MAP.get()).devices() { + let base = area.start; + let size = area.end.saturating_sub(area.start); + for i in 0..size / PAGE_SIZE { + let phys = PhysicalAddress::new(base + i * PAGE_SIZE); + let virt = A::phys_to_virt(phys); + let flags = kernel_page_flags::(virt).device_memory(true); + let flush = mapper + .map_phys(virt, phys, flags) + .expect("failed to map frame"); + flush.ignore(); // Not the active table + } + } + + // Ensure graphical debug region remains paged + { + use crate::devices::graphical_debug::FRAMEBUFFER; + + let (phys, virt, size) = *FRAMEBUFFER.lock(); + + let pages = size.div_ceil(PAGE_SIZE); + for i in 0..pages { + let phys = PhysicalAddress::new(phys + i * PAGE_SIZE); + let virt = VirtualAddress::new(virt + i * PAGE_SIZE); + let flags = 
PageFlags::new().write(true).write_combining(true); + let flush = mapper + .map_phys(virt, phys, flags) + .expect("failed to map frame"); + flush.ignore(); // Not the active table + } + } + + debug!("Table: {:X}", mapper.table().phys().data()); + mapper.table().debug_entries(|args| debug!("{args}")); + + // Use the new table + mapper.make_current(); + } +} + +pub unsafe fn init(args: &KernelArgs, low_limit: Option, high_limit: Option) { + register_memory_from_kernel_args(args); + + unsafe { + let physmem_limit = MemoryEntry { + start: align_up(low_limit.unwrap_or(0)), + end: align_down(high_limit.unwrap_or(usize::MAX)), + kind: BootloaderMemoryKind::Free, + }; + + let areas = &mut *crate::memory::AREAS.get(); + let mut area_i = 0; + + // Copy initial memory map, and page align it + for area in (*MEMORY_MAP.get()).free() { + debug!("{:X}:{:X}", area.start, area.end); + + if let Some(area) = area.intersect(&physmem_limit) { + add_memory(areas, &mut area_i, area); + } + } + + areas[..area_i].sort_unstable_by_key(|area| area.base); + crate::memory::AREA_COUNT.get().write(area_i as u16); + + // free memory map in now ready + let areas = crate::memory::areas(); + + // First, calculate how much memory we have + let mut size = 0_usize; + for area in areas.iter() { + if area.size > 0 { + debug!("{:X?}", area); + size = size.saturating_add(area.size); + } + } + + info!("Memory: {} MB", size.div_ceil(MEGABYTE)); + + // Create a basic allocator for the first pages + let mut bump_allocator = BumpAllocator::::new(areas, 0); + + map_memory(areas, &mut bump_allocator); + + // Create the physical memory map + let offset = bump_allocator.offset(); + info!("Permanently used: {} KB", offset.div_ceil(KILOBYTE)); + + crate::memory::init_mm(bump_allocator); + } +} diff --git a/src/startup/mod.rs b/src/startup/mod.rs new file mode 100644 index 0000000000..8ad3cdf7f8 --- /dev/null +++ b/src/startup/mod.rs @@ -0,0 +1,238 @@ +use core::{ + hint, slice, + sync::atomic::{AtomicBool, Ordering}, 
+}; + +use crate::{ + arch::interrupt, + context, + context::switch::SwitchResult, + memory::{PhysicalAddress, RmmA, RmmArch}, + profiling, scheme, + sync::CleanLockToken, +}; + +pub mod memory; + +#[repr(C, packed(8))] +pub(crate) struct KernelArgs { + kernel_base: u64, + kernel_size: u64, + + stack_base: u64, + stack_size: u64, + + env_base: u64, + env_size: u64, + + /// The base pointer to the saved RSDP or device tree blob. + /// + /// On x86 this field can be NULL, and if so, the system has not booted + /// with UEFI or in some other way retrieved the RSDPs. The kernel or a + /// userspace driver will thus try searching the BIOS memory instead. On + /// UEFI systems, searching is not guaranteed to actually work though. + /// On other architectures this field must always contain a pointer to + /// either an RSDP or device tree blob. + pub(crate) hwdesc_base: u64, + pub(crate) hwdesc_size: u64, + + areas_base: u64, + areas_size: u64, + + /// The physical base 64-bit pointer to the contiguous bootstrap/initfs. + bootstrap_base: u64, + /// Size of contiguous bootstrap/initfs physical region, not necessarily page aligned. 
+ bootstrap_size: u64, +} + +impl KernelArgs { + pub(crate) fn print(&self) { + debug!( + "Kernel: {:X}:{:X}", + { self.kernel_base }, + self.kernel_base + self.kernel_size + ); + debug!( + "Env: {:X}:{:X}", + { self.env_base }, + self.env_base + self.env_size + ); + debug!( + "HWDESC: {:X}:{:X}", + { self.hwdesc_base }, + self.hwdesc_base + self.hwdesc_size + ); + debug!( + "Areas: {:X}:{:X}", + { self.areas_base }, + self.areas_base + self.areas_size + ); + debug!( + "Bootstrap: {:X}:{:X}", + { self.bootstrap_base }, + self.bootstrap_base + self.bootstrap_size + ); + } + + pub(crate) fn bootstrap(&self) -> Bootstrap { + Bootstrap { + base: crate::memory::Frame::containing(crate::memory::PhysicalAddress::new( + self.bootstrap_base as usize, + )), + page_count: (self.bootstrap_size as usize) / crate::memory::PAGE_SIZE, + env: self.env(), + } + } + + pub(crate) fn env(&self) -> &'static [u8] { + unsafe { + slice::from_raw_parts( + RmmA::phys_to_virt(PhysicalAddress::new(self.env_base as usize)).data() + as *const u8, + self.env_size as usize, + ) + } + } + + pub(crate) fn acpi_rsdp(&self) -> Option<*const u8> { + if self.hwdesc_base != 0 { + let data = unsafe { + slice::from_raw_parts( + RmmA::phys_to_virt(PhysicalAddress::new(self.hwdesc_base as usize)).data() + as *const u8, + self.hwdesc_size as usize, + ) + }; + if data.starts_with(b"RSD PTR ") { + Some(data.as_ptr()) + } else { + None + } + } else { + None + } + } + + pub(crate) fn dtb(&self) -> Option> { + if self.hwdesc_base != 0 { + let data = unsafe { + slice::from_raw_parts( + RmmA::phys_to_virt(PhysicalAddress::new(self.hwdesc_base as usize)).data() + as *const u8, + self.hwdesc_size as usize, + ) + }; + fdt::Fdt::new(data).ok() + } else { + None + } + } +} + +pub(crate) fn init_env() -> &'static [u8] { + BOOTSTRAP.get().expect("BOOTSTRAP was not set").env +} + +extern "C" fn userspace_init() { + let mut token = unsafe { CleanLockToken::new() }; + let bootstrap = BOOTSTRAP.get().expect("BOOTSTRAP was not 
set"); + unsafe { crate::syscall::process::usermode_bootstrap(bootstrap, &mut token) } +} + +pub(crate) struct Bootstrap { + pub(crate) base: crate::memory::Frame, + pub(crate) page_count: usize, + env: &'static [u8], +} + +static BOOTSTRAP: spin::Once = spin::Once::new(); +pub(crate) static AP_READY: AtomicBool = AtomicBool::new(false); +static BSP_READY: AtomicBool = AtomicBool::new(false); + +/// This is the kernel entry point for the primary CPU. The arch crate is responsible for calling this +pub(crate) fn kmain(bootstrap: Bootstrap) -> ! { + let mut token = unsafe { CleanLockToken::new() }; + + BSP_READY.store(true, Ordering::SeqCst); + + //Initialize the first context, stored in kernel/src/context/mod.rs + context::init(&mut token); + + //Initialize global schemes, such as `acpi:`. + scheme::init_globals(); + + debug!("BSP: {} CPUs", crate::cpu_count()); + debug!("Env: {:?}", ::core::str::from_utf8(bootstrap.env)); + + BOOTSTRAP.call_once(|| bootstrap); + + profiling::ready_for_profiling(); + + let owner = None; // kmain not owned by any fd + match context::spawn(true, owner, userspace_init, &mut token) { + Ok(context_lock) => { + let mut context = context_lock.write(token.token()); + context.status = context::Status::Runnable; + context.name.clear(); + context.name.push_str("[bootstrap]"); + + // TODO: Remove these from kernel + context.euid = 0; + context.egid = 0; + } + Err(err) => { + panic!("failed to spawn userspace_init: {:?}", err); + } + } + + run_userspace(&mut token) +} + +/// This is the main kernel entry point for secondary CPUs +#[allow(unreachable_code, unused_variables, dead_code)] +pub(crate) fn kmain_ap(cpu_id: crate::cpu_set::LogicalCpuId) -> ! 
{ + let mut token = unsafe { CleanLockToken::new() }; + + AP_READY.store(true, Ordering::SeqCst); + while !BSP_READY.load(Ordering::SeqCst) { + hint::spin_loop(); + } + + profiling::maybe_run_profiling_helper_forever(cpu_id); + + if !cfg!(feature = "multi_core") { + debug!("AP {}: Disabled", cpu_id); + + loop { + unsafe { + interrupt::disable(); + interrupt::halt(); + } + } + } + + context::init(&mut token); + + debug!("AP {}", cpu_id); + + profiling::ready_for_profiling(); + + run_userspace(&mut token); +} + +fn run_userspace(token: &mut CleanLockToken) -> ! { + loop { + unsafe { + interrupt::disable(); + match context::switch(token) { + SwitchResult::Switched => { + interrupt::enable_and_nop(); + } + SwitchResult::AllContextsIdle => { + // Enable interrupts, then halt CPU (to save power) until the next interrupt is actually fired. + interrupt::enable_and_halt(); + } + } + } + } +} diff --git a/src/sync/mod.rs b/src/sync/mod.rs new file mode 100644 index 0000000000..6ad2708ba4 --- /dev/null +++ b/src/sync/mod.rs @@ -0,0 +1,5 @@ +pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue}; + +pub mod ordered; +pub mod wait_condition; +pub mod wait_queue; diff --git a/src/sync/ordered.rs b/src/sync/ordered.rs new file mode 100644 index 0000000000..91d46158db --- /dev/null +++ b/src/sync/ordered.rs @@ -0,0 +1,734 @@ +// This code was adapted from MIT licensed https://github.com/antialize/ordered-locks +// We cannot use that library directly as it is wrapping std::sync types + +#![allow(dead_code)] + +//! This crate implements compiletime ordering of locks into levels, [`L1`], [`L2`], [`L3`], [`L4`] and [`L5`]. +//! In order to acquire a lock at level `i` only locks at level `i-1` or below may be held. +//! +//! If locks are always acquired in level order on all threads, then one cannot have a deadlock +//! involving only acquired locks. +//! +//! In the following example we create two [mutexes](Mutex) at level [`L1`] and [`L2`] and lock them +//! 
in the proper order.
+//! ```
+//! use ordered_locks::{L1, L2, Mutex, CleanLockToken};
+//! // Create value at lock level L1; this lock cannot be acquired while an L2 lock is held
+//! let v1 = Mutex::<L1, _>::new(42);
+//! // Create value at lock level L2
+//! let v2 = Mutex::<L2, _>::new(43);
+//! // Construct a token indicating that this thread does not hold any locks
+//! let mut token = unsafe {CleanLockToken::new()};
+//!
+//! {
+//!     // We can acquire the locks for v1 and v2 at the same time
+//!     let mut g1 = v1.lock(token.token());
+//!     let (g1, token) = g1.token_split();
+//!     let mut g2 = v2.lock(token);
+//!     *g2 = 11;
+//!     *g1 = 12;
+//! }
+//! // Once the guards are dropped we can acquire other things
+//! *v2.lock(token.token()) = 13;
+//! ```
+//!
+//! In the following example we create two [mutexes](Mutex) at level [`L1`] and [`L2`] and try to lock
+//! the mutex at [`L1`] while already holding a [`Mutex`] at [`L2`], which fails to compile.
+//! ```compile_fail
+//! use ordered_locks::{L1, L2, Mutex, CleanLockToken};
+//! // Create value at lock level L1; this lock cannot be acquired while an L2 lock is held
+//! let v1 = Mutex::<L1, _>::new(42);
+//! // Create value at lock level L2
+//! let v2 = Mutex::<L2, _>::new(43);
+//! // Construct a token indicating that this thread does not hold any locks
+//! let mut clean_token = unsafe {CleanLockToken::new()};
+//! let token = clean_token.token();
+//!
+//! // Try to acquire locks in the wrong order
+//! let mut g2 = v2.lock(token);
+//! let (g2, token) = g2.token_split();
+//! let mut g1 = v1.lock(token); // shouldn't compile!
+//! *g2 = 11;
+//! *g1 = 12;
+//! ```
+use alloc::sync::Arc;
+use core::marker::PhantomData;
+
+use crate::percpu::PercpuBlock;
+
+/// Lock level of a mutex
+///
+/// While a mutex of L1 is locked on a thread, only mutexes of L2 or higher may be locked.
+/// This lock hierarchy prevents deadlocks from occurring.
For a deadlock to occur
+/// we need some thread TA to hold a resource RA, and request a resource RB, while
+/// another thread TB holds RB, and requests RA. This is not possible with a lock
+/// hierarchy: either RA or RB must be on a higher level than the other.
+///
+/// At some point in time we would want Level to be replaced by usize, however
+/// with current const generics (rust 1.55), we cannot compare const generic arguments
+/// so we are left with this mess.
+pub trait Level {}
+
+/// Indicate that the implementor is lower than the level O
+pub trait Lower<O: Level>: Level {}
+
+/// Lowest locking level, no locks can be on this level
+#[derive(Debug)]
+pub struct L0 {}
+
+#[derive(Debug)]
+pub struct L1 {}
+
+#[derive(Debug)]
+pub struct L2 {}
+
+#[derive(Debug)]
+pub struct L3 {}
+
+#[derive(Debug)]
+pub struct L4 {}
+
+#[derive(Debug)]
+pub struct L5 {}
+
+#[derive(Debug)]
+pub struct L6 {}
+
+impl Level for L0 {}
+impl Level for L1 {}
+impl Level for L2 {}
+impl Level for L3 {}
+impl Level for L4 {}
+impl Level for L5 {}
+impl Level for L6 {}
+
+impl Lower<L1> for L0 {}
+impl Lower<L2> for L0 {}
+impl Lower<L3> for L0 {}
+impl Lower<L4> for L0 {}
+impl Lower<L5> for L0 {}
+impl Lower<L6> for L0 {}
+
+impl Lower<L2> for L1 {}
+impl Lower<L3> for L1 {}
+impl Lower<L4> for L1 {}
+impl Lower<L5> for L1 {}
+impl Lower<L6> for L1 {}
+
+impl Lower<L3> for L2 {}
+impl Lower<L4> for L2 {}
+impl Lower<L5> for L2 {}
+impl Lower<L6> for L2 {}
+
+impl Lower<L4> for L3 {}
+impl Lower<L5> for L3 {}
+impl Lower<L6> for L3 {}
+
+impl Lower<L5> for L4 {}
+impl Lower<L6> for L4 {}
+
+impl Lower<L6> for L5 {}
+
+/// Indicate that the implementor is higher than the level O
+pub trait Higher<O: Level>: Level {}
+impl<L1: Level, L2: Level> Higher<L2> for L1 where L2: Lower<L1> {}
+
+/// While this exists, only locks with a level higher than L may be locked.
+/// These tokens are carried around the call stack to indicate the current locking level.
+/// They have no size and should disappear at runtime.
+pub struct LockToken<'a, L: Level>(PhantomData<&'a mut L>);
+
+impl<'a, L: Level> LockToken<'a, L> {
+    /// Create a borrowed copy of self
+    pub fn token(&mut self) -> LockToken<'_, L> {
+        LockToken(Default::default())
+    }
+
+    /// Create a borrowed copy of self, on a higher level
+    pub fn downgrade<LC: Higher<L>>(&mut self) -> LockToken<'_, LC> {
+        LockToken(Default::default())
+    }
+
+    pub fn downgraded<LP: Lower<L>>(_: LockToken<'a, LP>) -> Self {
+        LockToken(Default::default())
+    }
+}
+
+/// Token indicating that there are no acquired locks while not borrowed.
+pub struct CleanLockToken(());
+
+impl CleanLockToken {
+    /// Create a borrowed copy of self
+    pub fn token(&mut self) -> LockToken<'_, L0> {
+        LockToken(Default::default())
+    }
+
+    /// Create a borrowed copy of self, on a higher level
+    pub fn downgrade<L: Level>(&mut self) -> LockToken<'_, L> {
+        LockToken(Default::default())
+    }
+
+    /// Create a new instance
+    ///
+    /// # Safety
+    ///
+    /// This is safe to call as long as there are no currently acquired locks
+    /// in the thread/task, and as long as there are no other CleanLockToken
+    /// in the thread/task.
+    ///
+    /// A CleanLockToken is zero-sized; it only exists to hand out [`LockToken`]s.
+    pub unsafe fn new() -> Self {
+        CleanLockToken(())
+    }
+}
+
+/// A mutual exclusion primitive useful for protecting shared data
+///
+/// This mutex will block threads waiting for the lock to become available. The
+/// mutex can also be statically initialized or created via a `new`
+/// constructor. Each mutex has a type parameter which represents the data that
+/// it is protecting. The data can only be accessed through the RAII guards
+/// returned from `lock` and `try_lock`, which guarantees that the data is only
+/// ever accessed when the mutex is locked.
+#[derive(Debug)]
+pub struct Mutex<L: Level, T> {
+    inner: spin::Mutex<T>,
+    _phantom: PhantomData<L>,
+}
+
+impl<L: Level, T: Default> Default for Mutex<L, T> {
+    fn default() -> Self {
+        Self {
+            inner: Default::default(),
+            _phantom: Default::default(),
+        }
+    }
+}
+
+#[cfg(feature = "busy_panic")]
+pub const DEADLOCK_SPIN_CAP: usize = 5000;
+
+impl<L: Level, T> Mutex<L, T> {
+    /// Creates a new mutex in an unlocked state ready for use
+    pub const fn new(val: T) -> Self {
+        Self {
+            inner: spin::Mutex::new(val),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Acquires a mutex, spinning until it is able to do so; on return the caller is the only holder.
+    ///
+    /// While the lock is contended this calls `maybe_handle_tlb_shootdown` on the current per-CPU block
+    /// between attempts; with the `busy_panic` feature it panics after `DEADLOCK_SPIN_CAP` failed attempts.
+    /// An RAII guard is returned to allow scoped unlock of the lock. When the guard goes out of scope, the mutex will be unlocked.
+    pub fn lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> MutexGuard<'a, L, T> {
+        let inner = {
+            #[cfg(feature = "busy_panic")]
+            let mut i = DEADLOCK_SPIN_CAP;
+            let my_percpu = PercpuBlock::current();
+
+            loop {
+                match self.inner.try_lock() {
+                    Some(inner) => break inner,
+                    None => {
+                        my_percpu.maybe_handle_tlb_shootdown();
+                        core::hint::spin_loop();
+                        #[cfg(feature = "busy_panic")]
+                        {
+                            i -= 1;
+                            if i == 0 {
+                                panic!("Deadlock at mutex may have triggered")
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        MutexGuard {
+            inner,
+            lock_token: LockToken::downgraded(lock_token),
+        }
+    }
+
+    /// Attempts to acquire this lock.
+    ///
+    /// If the lock could not be acquired at this time, then `None` is returned.
+    /// Otherwise, an RAII guard is returned. The lock will be unlocked when the
+    /// guard is dropped.
+    ///
+    /// This function does not block.
+    pub fn try_lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> Option<MutexGuard<'a, L, T>> {
+        self.inner.try_lock().map(|inner| MutexGuard {
+            inner,
+            lock_token: LockToken::downgraded(lock_token),
+        })
+    }
+
+    /// Acquires the lock again, reusing a lock_token that is already at this mutex's level.
+    /// SAFETY: Caller must guarantee lock_token is coming from MutexGuard::into_split() on the same lock,
+    /// OR that lock_token is coming from a different lock, which can happen when two locks need to copy data to each other.
+    pub unsafe fn relock<'a>(&'a self, lock_token: LockToken<'a, L>) -> MutexGuard<'a, L, T> {
+        let inner = {
+            #[cfg(feature = "busy_panic")]
+            let mut i = DEADLOCK_SPIN_CAP;
+            let my_percpu = PercpuBlock::current();
+
+            loop {
+                match self.inner.try_lock() {
+                    Some(inner) => break inner,
+                    None => {
+                        my_percpu.maybe_handle_tlb_shootdown();
+                        core::hint::spin_loop();
+                        #[cfg(feature = "busy_panic")]
+                        {
+                            i -= 1;
+                            if i == 0 {
+                                panic!("Deadlock at mutex may have triggered")
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        MutexGuard {
+            inner,
+            lock_token,
+        }
+    }
+
+    /// Consumes this Mutex, returning the underlying data.
+    pub fn into_inner(self) -> T {
+        self.inner.into_inner()
+    }
+}
+
+/// An RAII implementation of a "scoped lock" of a mutex. When this structure is
+/// dropped (falls out of scope), the lock will be unlocked.
+///
+/// The data protected by the mutex can be accessed through this guard via its
+/// `Deref` and `DerefMut` implementations.
+pub struct MutexGuard<'a, L: Level, T: ?Sized + 'a> {
+    inner: spin::MutexGuard<'a, T>,
+    lock_token: LockToken<'a, L>,
+}
+
+impl<'a, L: Level, T: ?Sized + 'a> MutexGuard<'a, L, T> {
+    /// Split the guard into two parts: the first a mutable reference to the held content,
+    /// the second a [`LockToken`] that can be used for further locking
+    pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) {
+        (&mut self.inner, self.lock_token.token())
+    }
+
+    /// Split the guard into two parts: the first is the owned inner guard,
+    /// the second a [`LockToken`] that can be used for further locking
+    pub fn into_split(self) -> (spin::MutexGuard<'a, T>, LockToken<'a, L>) {
+        (self.inner, self.lock_token)
+    }
+
+    /// Rebuild the guard from the two parts returned by `into_split`
+    pub fn from_split(lock: spin::MutexGuard<'a, T>, token: LockToken<'a, L>) -> Self {
+        Self {
+            inner: lock,
+            lock_token: token,
+        }
+    }
+}
+
+impl<'a, L: Level, T: ?Sized + 'a> core::ops::Deref for MutexGuard<'a, L, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.inner.deref()
+    }
+}
+impl<'a, L: Level, T: ?Sized + 'a> core::ops::DerefMut for MutexGuard<'a, L, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.inner.deref_mut()
+    }
+}
+
+#[derive(Debug)]
+pub struct RwLock<L: Level, T> {
+    inner: spin::RwLock<T>,
+    _phantom: PhantomData<L>,
+}
+
+impl<L: Level, T: Default> Default for RwLock<L, T> {
+    fn default() -> Self {
+        Self {
+            inner: Default::default(),
+            _phantom: Default::default(),
+        }
+    }
+}
+
+/// A reader-writer lock
+///
+/// This type of lock allows a number of readers or at most one writer at any point in time.
+/// The write portion of this lock typically allows modification of the underlying data (exclusive access)
+/// and the read portion of this lock typically allows for read-only access (shared access).
+///
+/// The type parameter T represents the data that this lock protects. It is required that T satisfies
+/// Send to be shared across threads and Sync to allow concurrent access through readers.
+/// The RAII guards returned from the locking methods implement Deref (and DerefMut for the write methods)
+/// to allow access to the content of the lock.
+impl<L: Level, T> RwLock<L, T> {
+    /// Creates a new instance of an RwLock which is unlocked.
+    pub const fn new(val: T) -> Self {
+        Self {
+            inner: spin::RwLock::new(val),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Returns a mutable reference to the underlying data without locking.
+    /// The mutable borrow statically guarantees no locks exist thus safe to use.
+    pub fn get_mut(&mut self) -> &mut T {
+        self.inner.get_mut()
+    }
+
+    /// Returns a mutable pointer to the underlying data.
+    /// Writing to the data is undefined behavior unless locking is guaranteed by caller.
+    pub fn as_mut_ptr(&self) -> *mut T {
+        self.inner.as_mut_ptr()
+    }
+
+    /// Consumes this RwLock, returning the underlying data.
+    pub fn into_inner(self) -> T {
+        self.inner.into_inner()
+    }
+
+    /// Locks this RwLock with exclusive write access, blocking the current thread until it can be acquired.
+    /// This function will not return while other writers or other readers currently have access to the lock.
+    /// Returns an RAII guard which will drop the write access of this RwLock when dropped.
+    pub fn write<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> RwLockWriteGuard<'a, L, T> {
+        let inner = {
+            #[cfg(feature = "busy_panic")]
+            let mut i = DEADLOCK_SPIN_CAP;
+            let my_percpu = PercpuBlock::current();
+            loop {
+                match self.inner.try_write() {
+                    Some(inner) => break inner,
+                    None => {
+                        my_percpu.maybe_handle_tlb_shootdown();
+                        core::hint::spin_loop();
+                        #[cfg(feature = "busy_panic")]
+                        {
+                            i -= 1;
+                            if i == 0 {
+                                panic!("Deadlock at write may have triggered")
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        RwLockWriteGuard {
+            inner,
+            lock_token: LockToken::downgraded(lock_token),
+        }
+    }
+
+    /// Locks this RwLock with shared read access, blocking the current thread until it can be acquired.
+    ///
+    /// The calling thread will be blocked until there are no more writers which hold the lock.
+    /// There may be other readers currently inside the lock when this method returns.
+    ///
+    /// Note that attempts to recursively acquire a read lock on a RwLock when the current thread
+    /// already holds one may result in a deadlock.
+    ///
+    /// Returns an RAII guard which will release this thread’s shared access once it is dropped.
+    pub fn read<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> RwLockReadGuard<'a, L, T> {
+        let inner = {
+            #[cfg(feature = "busy_panic")]
+            let mut i = DEADLOCK_SPIN_CAP;
+            let my_percpu = PercpuBlock::current();
+            loop {
+                match self.inner.try_read() {
+                    Some(inner) => break inner,
+                    None => {
+                        my_percpu.maybe_handle_tlb_shootdown();
+                        core::hint::spin_loop();
+                        #[cfg(feature = "busy_panic")]
+                        {
+                            i -= 1;
+                            if i == 0 {
+                                panic!("Deadlock at read may have triggered")
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        RwLockReadGuard {
+            inner,
+            lock_token: LockToken::downgraded(lock_token),
+        }
+    }
+
+    pub fn upgradeable_read<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> RwLockUpgradableGuard<'a, L, T> {
+        let inner = {
+            #[cfg(feature = "busy_panic")]
+            let mut i = DEADLOCK_SPIN_CAP;
+            let my_percpu = PercpuBlock::current();
+            loop {
+                match self.inner.try_upgradeable_read() {
+                    Some(inner) => break inner,
+                    None => {
+                        my_percpu.maybe_handle_tlb_shootdown();
+                        core::hint::spin_loop();
+                        #[cfg(feature = "busy_panic")]
+                        {
+                            i -= 1;
+                            if i == 0 {
+                                panic!("Deadlock at upgradeable_read may have triggered")
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        RwLockUpgradableGuard {
+            inner,
+            lock_token: LockToken::downgraded(lock_token),
+        }
+    }
+
+    pub fn try_read<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> Option<RwLockReadGuard<'a, L, T>> {
+        let inner = self.inner.try_read()?;
+        Some(RwLockReadGuard {
+            inner,
+            lock_token: LockToken::downgraded(lock_token),
+        })
+    }
+
+    pub fn try_write<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> Option<RwLockWriteGuard<'a, L, T>> {
+        let inner = self.inner.try_write()?;
+        Some(RwLockWriteGuard {
+            inner,
+            lock_token: LockToken::downgraded(lock_token),
+        })
+    }
+
+    /// Acquires the write lock again, reusing the lock_token of an older RwLockWriteGuard.
+    /// SAFETY: Caller must guarantee lock_token is coming from RwLockWriteGuard::into_token() from the same lock,
+    /// OR that lock_token is coming from a different lock, which can happen when two locks need to copy data to each other.
+    pub unsafe fn rewrite<'a>(
+        &'a self,
+        lock_token: LockToken<'a, L>,
+    ) -> RwLockWriteGuard<'a, L, T> {
+        let inner = {
+            #[cfg(feature = "busy_panic")]
+            let mut i = DEADLOCK_SPIN_CAP;
+            let my_percpu = PercpuBlock::current();
+            loop {
+                match self.inner.try_write() {
+                    Some(inner) => break inner,
+                    None => {
+                        my_percpu.maybe_handle_tlb_shootdown();
+                        core::hint::spin_loop();
+                        #[cfg(feature = "busy_panic")]
+                        {
+                            i -= 1;
+                            if i == 0 {
+                                panic!("Deadlock at write may have triggered")
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        RwLockWriteGuard { inner, lock_token }
+    }
+
+    /// Acquires the upgradeable read lock again, reusing the lock_token of an older RwLockUpgradableGuard.
+    /// SAFETY: Caller must guarantee lock_token is coming from RwLockUpgradableGuard::into_token() from the same lock.
+    pub unsafe fn reupgradeable_read<'a>(
+        &'a self,
+        lock_token: LockToken<'a, L>,
+    ) -> RwLockUpgradableGuard<'a, L, T> {
+        let inner = {
+            #[cfg(feature = "busy_panic")]
+            let mut i = DEADLOCK_SPIN_CAP;
+            let my_percpu = PercpuBlock::current();
+            loop {
+                match self.inner.try_upgradeable_read() {
+                    Some(inner) => break inner,
+                    None => {
+                        my_percpu.maybe_handle_tlb_shootdown();
+                        core::hint::spin_loop();
+                        #[cfg(feature = "busy_panic")]
+                        {
+                            i -= 1;
+                            if i == 0 {
+                                panic!("Deadlock at reupgradeable_read may have triggered")
+                            }
+                        }
+                    }
+                }
+            }
+        };
+        RwLockUpgradableGuard { inner, lock_token }
+    }
+
+    /// SAFETY: unsafe because it takes no lock token and so bypasses the level check; currently required by context::switch
+    pub unsafe fn write_arc(self: &Arc<Self>) -> ArcRwLockWriteGuard<L, T> {
+        core::mem::forget(self.inner.write());
+        ArcRwLockWriteGuard {
+            rwlock: self.clone(),
+        }
+    }
+}
+
+/// RAII structure used to release the exclusive write access of a lock when dropped
+pub struct RwLockWriteGuard<'a, L: Level, T> {
+    inner: spin::RwLockWriteGuard<'a, T>,
+    lock_token: LockToken<'a, L>,
+}
+
+impl<'a, L: Level, T> RwLockWriteGuard<'a, L, T> {
+    /// Split the guard into two parts: the first a mutable reference to the held content,
+    /// the second a [`LockToken`] that can be used for further locking
+    pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) {
+        (&mut self.inner, self.lock_token.token())
+    }
+
+    /// Drop this Guard and extract the token to be reused for another write lock with rewrite()
+    pub fn into_token(self) -> LockToken<'a, L> {
+        drop(self.inner);
+        self.lock_token
+    }
+}
+
+impl<L: Level, T> core::ops::Deref for RwLockWriteGuard<'_, L, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.inner.deref()
+    }
+}
+
+impl<L: Level, T> core::ops::DerefMut for RwLockWriteGuard<'_, L, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.inner.deref_mut()
+    }
+}
+
+/// RAII structure used to release the shared read access of a lock when dropped.
+pub struct RwLockReadGuard<'a, L: Level, T> { + inner: spin::RwLockReadGuard<'a, T>, + lock_token: LockToken<'a, L>, +} + +impl RwLockReadGuard<'_, L, T> { + /// Split the guard into two parts, the first a reference to the held content + /// the second a [`LockToken`] that can be used for further locking + pub fn token_split(&mut self) -> (&T, LockToken<'_, L>) { + (&self.inner, self.lock_token.token()) + } +} + +impl core::ops::Deref for RwLockReadGuard<'_, L, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.inner.deref() + } +} + +/// RAII structure used to release the shared read access of a lock when dropped. Allows upgrade to RW lock +pub struct RwLockUpgradableGuard<'a, L: Level, T> { + inner: spin::RwLockUpgradableGuard<'a, T>, + lock_token: LockToken<'a, L>, +} + +impl<'a, L: Level, T> RwLockUpgradableGuard<'a, L, T> { + /// Split the guard into two parts, the first a reference to the held content + /// the second a [`LockToken`] that can be used for further locking + pub fn token_split(&mut self) -> (&T, LockToken<'_, L>) { + (&self.inner, self.lock_token.token()) + } + + /// Upgrade to RW lock + pub fn upgrade(self) -> RwLockWriteGuard<'a, L, T> { + RwLockWriteGuard { + inner: spin::RwLockUpgradableGuard::upgrade(self.inner), + lock_token: self.lock_token, + } + } + + /// Drop this Guard and extract the token to be reused for another write lock with reupgradeable_read() + pub fn into_token(self) -> LockToken<'a, L> { + drop(self.inner); + self.lock_token + } +} + +impl core::ops::Deref for RwLockUpgradableGuard<'_, L, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.inner.deref() + } +} + +pub struct ArcRwLockWriteGuard { + rwlock: Arc>, +} + +impl ArcRwLockWriteGuard { + pub fn rwlock(s: &Self) -> &Arc> { + &s.rwlock + } +} + +impl core::ops::Deref for ArcRwLockWriteGuard { + type Target = T; + + #[inline] + fn deref(&self) -> &Self::Target { + unsafe { &*self.rwlock.inner.as_mut_ptr() } + } +} + +impl 
core::ops::DerefMut for ArcRwLockWriteGuard { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { &mut *self.rwlock.inner.as_mut_ptr() } + } +} + +impl Drop for ArcRwLockWriteGuard { + #[inline] + fn drop(&mut self) { + unsafe { + self.rwlock.inner.force_write_unlock(); + } + } +} + +/// This function can only be called if no lock is held by the calling thread/task +#[inline] +pub fn check_no_locks(_: LockToken<'_, L0>) {} diff --git a/src/sync/wait_condition.rs b/src/sync/wait_condition.rs new file mode 100644 index 0000000000..c508eb64fe --- /dev/null +++ b/src/sync/wait_condition.rs @@ -0,0 +1,147 @@ +use core::mem::ManuallyDrop; + +use alloc::{ + sync::{Arc, Weak}, + vec::Vec, +}; + +use crate::{ + context::{self, ContextLock, PreemptGuardL2}, + sync::{CleanLockToken, LockToken, Mutex, L1, L2, L3}, +}; + +#[derive(Debug)] +pub struct WaitCondition { + contexts: Mutex>>, +} + +impl WaitCondition { + pub const fn new() -> WaitCondition { + WaitCondition { + contexts: Mutex::new(Vec::new()), + } + } + + // Notify all waiters + pub fn notify(&self, token: &mut CleanLockToken) -> usize { + self.notify_locked(token.token().downgrade()) + } + + pub fn notify_locked(&self, token: LockToken<'_, L1>) -> usize { + let mut contexts = self.contexts.lock(token); + let (contexts, mut token) = contexts.token_split(); + let len = contexts.len(); + while let Some(context_weak) = contexts.pop() { + if let Some(context_ref) = context_weak.upgrade() { + context_ref.write(token.token()).unblock(); + } + } + len + } + + // Notify as though a signal woke the waiters + pub unsafe fn notify_signal(&self, token: LockToken<'_, L1>) -> usize { + let mut contexts = self.contexts.lock(token); + let (contexts, mut token) = contexts.token_split(); + let len = contexts.len(); + for context_weak in contexts.iter() { + if let Some(context_ref) = context_weak.upgrade() { + context_ref.write(token.token()).unblock(); + } + } + len + } + + /// Wait until notified. 
Unlocks guard when blocking is ready. Returns false if resumed by a signal or the notify_signal function. + /// SAFETY: Caller MUST ensure the given token is coming from the guard. There is no compiler check to do it. + pub fn wait<'a, T>( + &self, + guard: T, + reason: &'static str, + token: &'a mut LockToken<'a, L1>, + ) -> bool { + let mut token = token.downgrade(); + self.wait_inner(guard, reason, &mut token) + } + + pub fn wait_inner<'a, T>( + &self, + guard: T, + reason: &'static str, + token: &'a mut LockToken<'a, L2>, + ) -> bool { + let current_context_ref = context::current(); + { + // Avoid a context switch between blocking ourselves and adding + // ourselves to the wait list as otherwise we might miss a wakeup. + // We cannot add ourselves to the wait list first as that would lead + // to deadlock if we were woken up immediately. + let mut token = token.token(); + let mut preempt = PreemptGuardL2::new(¤t_context_ref, &mut token); + let token = preempt.token(); + { + let mut context = current_context_ref.write(token.token()); + if let Some((control, pctl, _)) = context.sigcontrol() + && control.currently_pending_unblocked(pctl) != 0 + { + return false; + } + context.block(reason); + } + + self.contexts + .lock(token.token()) + .push(Arc::downgrade(¤t_context_ref)); + + drop(guard); + } + + { + // SAFETY: Guaranteed by caller + let token = unsafe { &mut CleanLockToken::new() }; + context::switch(token); + } + + let mut waited = true; + + { + let mut contexts = self.contexts.lock(token.token()); + + if let Some(index) = contexts + .iter() + .position(|c| Weak::as_ptr(c) == Arc::as_ptr(¤t_context_ref)) + { + contexts.remove(index); + waited = false; + } + } + + waited + } + + pub fn into_drop(self, token: &mut CleanLockToken) { + self.into_drop_locked(token.token().downgrade()); + } + + pub fn into_drop_locked(self, token: LockToken<'_, L1>) { + ManuallyDrop::new(self).inner_drop(token); + } + + fn inner_drop(&mut self, token: LockToken<'_, L1>) { + unsafe 
{ + self.notify_signal(token); + } + } +} + +impl Drop for WaitCondition { + fn drop(&mut self) { + //TODO: drop violates lock tokens + let mut token = unsafe { CleanLockToken::new() }; + self.inner_drop(token.downgrade()); + #[cfg(feature = "drop_panic")] + { + panic!("WaitCondition dropped"); + } + } +} diff --git a/src/sync/wait_queue.rs b/src/sync/wait_queue.rs new file mode 100644 index 0000000000..7e2c21e2df --- /dev/null +++ b/src/sync/wait_queue.rs @@ -0,0 +1,106 @@ +use alloc::collections::VecDeque; +use syscall::{EAGAIN, EINTR}; + +use crate::{ + sync::{CleanLockToken, LockToken, Mutex, MutexGuard, WaitCondition, L1, L2, L3}, + syscall::{ + error::{Error, Result, EINVAL}, + usercopy::UserSliceWo, + }, +}; + +#[derive(Debug)] +pub struct WaitQueue { + incoming: Mutex>, + outgoing: Mutex>, + pub condition: WaitCondition, +} + +impl WaitQueue { + pub const fn new() -> WaitQueue { + WaitQueue { + incoming: Mutex::new(VecDeque::new()), + outgoing: Mutex::new(VecDeque::new()), + condition: WaitCondition::new(), + } + } + + pub fn is_currently_empty(&self, token: &mut CleanLockToken) -> bool { + self.incoming.lock(token.token()).is_empty() && self.outgoing.lock(token.token()).is_empty() + } + + pub fn receive_into_user( + &self, + buf: UserSliceWo, + block: bool, + reason: &'static str, + token: &mut CleanLockToken, + ) -> Result { + let mut out_guard = self.outgoing.lock(token.token()); + loop { + let (mut outgoing, mut token) = out_guard.into_split(); + if !outgoing.is_empty() { + let (s1, s2) = outgoing.as_slices(); + let s1_bytes = unsafe { + core::slice::from_raw_parts(s1.as_ptr().cast::(), size_of_val(s1)) + }; + let s2_bytes = unsafe { + core::slice::from_raw_parts(s2.as_ptr().cast::(), size_of_val(s2)) + }; + + let mut bytes_copied = buf.copy_common_bytes_from_slice(s1_bytes)?; + + if let Some(buf_for_s2) = buf.advance(s1_bytes.len()) { + bytes_copied += buf_for_s2.copy_common_bytes_from_slice(s2_bytes)?; + } + + let _ = outgoing.drain(..bytes_copied / 
size_of::()); + return Ok(bytes_copied); + } + + let mut incoming = self.incoming.lock(token.token()); + + if incoming.is_empty() { + if block { + drop(incoming); + // SAFETY: Uses wait_inner because this inner is L2. It's guaranteed there's no other + // lock held at this point because clean token is provided from caller. + if !self + .condition + .wait_inner(outgoing, reason, &mut token.token()) + { + return Err(Error::new(EINTR)); + } + out_guard = unsafe { self.outgoing.relock(token) }; + continue; + } else if buf.is_empty() { + return Ok(0); + } else if buf.len() < size_of::() { + return Err(Error::new(EINVAL)); + } else { + // TODO: EWOULDBLOCK? + return Err(Error::new(EAGAIN)); + } + } + + core::mem::swap(&mut *incoming, &mut outgoing); + drop(incoming); + + out_guard = MutexGuard::from_split(outgoing, token); + } + } + + pub fn send(&self, value: T, token: &mut CleanLockToken) -> usize { + self.send_locked(value, token.token().downgrade()) + } + + pub fn send_locked(&self, value: T, mut token: LockToken<'_, L1>) -> usize { + let len = { + let mut inner = self.incoming.lock(token.token()); + inner.push_back(value); + inner.len() + }; + self.condition.notify_locked(token); + len + } +} diff --git a/src/syscall/debug.rs b/src/syscall/debug.rs new file mode 100644 index 0000000000..853974d95b --- /dev/null +++ b/src/syscall/debug.rs @@ -0,0 +1,316 @@ +use alloc::{borrow::ToOwned, string::String, vec::Vec}; +use core::{ascii, fmt::Debug}; + +use super::{ + copy_path_to_buf, + data::{Map, Stat, TimeSpec}, + flag::*, + number::*, + usercopy::UserSlice, +}; + +use crate::{sync::CleanLockToken, syscall::error::Result}; + +struct ByteStr<'a>(&'a [u8]); + +impl Debug for ByteStr<'_> { + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + write!(f, "\"")?; + for i in self.0 { + for ch in ascii::escape_default(*i) { + write!(f, "{}", ch as char)?; + } + } + write!(f, "\"")?; + Ok(()) + } +} +fn debug_path(ptr: usize, len: usize) -> Result { + // 
TODO: PATH_MAX + UserSlice::ro(ptr, len).and_then(|slice| copy_path_to_buf(slice, 4096)) +} +fn debug_buf(ptr: usize, len: usize) -> Result> { + UserSlice::ro(ptr, len).and_then(|user| { + let mut buf = vec![0_u8; 4096]; + let count = user.copy_common_bytes_to_slice(&mut buf)?; + buf.truncate(count); + Ok(buf) + }) +} +unsafe fn read_struct(ptr: usize) -> Result { + unsafe { UserSlice::ro(ptr, size_of::()).and_then(|slice| slice.read_exact::()) } +} + +/// Formats syscall number `a` with its raw arguments `b`..`g` as a human-readable string for syscall tracing. +/// Pointer arguments are read through `UserSlice`; numbers without a dedicated arm are printed as `UNKNOWN`. +//TODO: calling format_call with arguments from another process space will not work +pub fn format_call(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g: usize) -> String { + match a { + SYS_OPENAT => format!( + "openat({} {:?}, {:#0x}, {}, {})", + b, + debug_path(c, d).as_ref().map(|p| ByteStr(p.as_bytes())), + e, + f, + g + ), + SYS_UNLINKAT => format!( + "unlinkat({} {:?}, {:#0x}, {}, {})", + b, + debug_path(c, d).as_ref().map(|p| ByteStr(p.as_bytes())), + e, + f, + g, + ), + SYS_CLOSE => format!("close({})", b), + SYS_DUP => format!( + "dup({}, {:?})", + b, + debug_buf(c, d).as_ref().map(|b| ByteStr(b)), + ), + SYS_DUP2 => format!( + "dup2({}, {}, {:?})", + b, + c, + debug_buf(d, e).as_ref().map(|b| ByteStr(b)), + ), + SYS_SENDFD => format!("sendfd({}, {}, {:#0x} {:#0x} {:#0x})", b, c, d, e, f,), + SYS_READ => format!("read({}, {:#X}, {})", b, c, d), + SYS_READ2 => format!( + "read2({}, {:#X}, {}, {}, {:?})", + b, + c, + d, + e, + (f != usize::MAX).then_some(RwFlags::from_bits_retain(f as u32)) + ), + SYS_WRITE => format!("write({}, {:#X}, {})", b, c, d), + SYS_WRITE2 => format!( + "write2({}, {:#X}, {}, {}, {:?})", + b, + c, + d, + e, + (f != usize::MAX).then_some(RwFlags::from_bits_retain(f as u32)) + ), + SYS_LSEEK => format!( + "lseek({}, {}, {} ({}))", + b, + c as isize, + match d { + SEEK_SET => "SEEK_SET", + SEEK_CUR => "SEEK_CUR", + SEEK_END => "SEEK_END", + _ => "UNKNOWN", + }, + d + ), + SYS_FCHMOD => format!("fchmod({}, {:#o})", b, c), + SYS_FCHOWN => format!("fchown({}, {}, {})", b, c, 
d), + SYS_FCNTL => format!( + "fcntl({}, {} ({}), {:#X})", + b, + match c { + F_DUPFD => "F_DUPFD", + F_GETFD => "F_GETFD", + F_SETFD => "F_SETFD", + F_SETFL => "F_SETFL", + F_GETFL => "F_GETFL", + F_DUPFD_CLOEXEC => "F_DUPFD_CLOEXEC", + _ => "UNKNOWN", + }, + c, + d + ), + SYS_FMAP => format!( + "fmap({}, {:?})", + b, + UserSlice::ro(c, d).and_then(|buf| unsafe { buf.read_exact::() }), + ), + SYS_FUNMAP => format!("funmap({:#X}, {:#X})", b, c,), + SYS_FLINK => format!("flink({}, {:?})", b, debug_path(c, d),), + SYS_FPATH => format!("fpath({}, {:#X}, {})", b, c, d), + SYS_FRENAME => format!("frename({}, {:?})", b, debug_path(c, d),), + SYS_FSTAT => format!( + "fstat({}, {:?})", + b, + UserSlice::ro(c, d).and_then(|buf| unsafe { buf.read_exact::() }), + ), + SYS_FSTATVFS => format!("fstatvfs({}, {:#X}, {})", b, c, d), + SYS_FSYNC => format!("fsync({})", b), + SYS_FTRUNCATE => format!("ftruncate({}, {})", b, c), + SYS_FUTIMENS => format!( + "futimens({}, {:?})", + b, + UserSlice::ro(c, d).and_then(|buf| { + let mut times = vec![unsafe { buf.read_exact::()? }]; + + // One or two timespecs + if let Some(second) = buf.advance(size_of::()) { + times.push(unsafe { second.read_exact::()? }); + } + Ok(times) + }), + ), + SYS_CALL => format!( + "call({b}, {c:x}+{d}, {:?}, {:0x?})", + CallFlags::from_bits_retain(e & !0xff), + // TODO: u64 + UserSlice::ro(f, (e & 0xff) * 8) + .and_then(|buf| buf.usizes().collect::>>()), + ), + + SYS_CLOCK_GETTIME => format!("clock_gettime({}, {:?})", b, unsafe { + read_struct::(c) + }), + SYS_FUTEX => format!( + "futex({:#X} [{:?}], {}, {}, {}, {})", + b, + UserSlice::ro(b, 4).and_then(|buf| buf.read_u32()), + c, + d, + e, + f + ), + SYS_MKNS => format!( + "mkns({:p} len: {})", + // TODO: Print out all scheme names? + + // Simply printing out the pointers and lengths may not provide that much useful + // debugging information, so only print the raw args. 
+ b as *const u8, + c, + ), + SYS_MPROTECT => format!("mprotect({:#X}, {}, {:?})", b, c, MapFlags::from_bits(d)), + SYS_MREMAP => format!("mremap({:#X}, {:#X}, {:#X}, {:#X}, {:#X})", b, c, d, e, f), + SYS_NANOSLEEP => format!( + "nanosleep({:?}, ({}, {}))", + unsafe { read_struct::(b) }, + c, + d + ), + SYS_YIELD => "yield()".to_owned(), + _ => format!( + "UNKNOWN{} {:#X}({:#X}, {:#X}, {:#X}, {:#X}, {:#X}, {:#X})", + a, a, b, c, d, e, f, g + ), + } +} + +#[derive(Clone, Copy, Debug, Default)] +pub struct SyscallDebugInfo { + this_switch_time: u128, + accumulated_time: u128, + do_debug: bool, +} +impl SyscallDebugInfo { + pub const fn default() -> Self { + Self { + this_switch_time: 0, + accumulated_time: 0, + do_debug: false, + } + } + + #[cfg(feature = "syscall_debug")] + pub fn on_switch_from(&mut self, token: &mut CleanLockToken) { + let now = crate::time::monotonic(token); + self.accumulated_time += now - core::mem::replace(&mut self.this_switch_time, now); + } + #[cfg(feature = "syscall_debug")] + pub fn on_switch_to(&mut self, token: &mut CleanLockToken) { + self.this_switch_time = crate::time::monotonic(token); + } +} + +#[cfg_attr(feature = "syscall_debug", inline)] +pub fn debug_start([a, b, c, d, e, f, g]: [usize; 7], token: &mut CleanLockToken) { + if cfg!(not(feature = "syscall_debug")) { + return; + } + + #[expect(clippy::overly_complex_bool_expr)] + #[expect(clippy::needless_bool)] + let do_debug = if false + && crate::context::current() + .read(token.token()) + .name + .contains("init") + { + if a == SYS_CLOCK_GETTIME || a == SYS_YIELD || a == SYS_FUTEX { + false + } else if (a == SYS_WRITE || a == SYS_FSYNC) && (b == 1 || b == 2) { + false + } else { + true + } + } else { + false + }; + + let debug_start = if do_debug { + let context_lock = crate::context::current(); + { + let context = context_lock.read(token.token()); + print!("{} (*{}*): ", context.name, context.pid,); + } + + // Do format_call outside print! 
so possible exception handlers cannot reentrantly + // deadlock. + let string = format_call(a, b, c, d, e, f, g); + println!("{}", string); + + crate::time::monotonic(token) + } else { + 0 + }; + + crate::percpu::PercpuBlock::current() + .syscall_debug_info + .set(SyscallDebugInfo { + accumulated_time: 0, + this_switch_time: debug_start, + do_debug, + }); +} + +#[cfg_attr(feature = "syscall_debug", inline)] +pub fn debug_end( + [a, b, c, d, e, f, g]: [usize; 7], + result: Result, + token: &mut CleanLockToken, +) { + if cfg!(not(feature = "syscall_debug")) { + return; + } + + let debug_info = crate::percpu::PercpuBlock::current() + .syscall_debug_info + .take(); + + if !debug_info.do_debug { + return; + } + let debug_duration = + debug_info.accumulated_time + (crate::time::monotonic(token) - debug_info.this_switch_time); + + let context_lock = crate::context::current(); + { + let context = context_lock.read(token.token()); + print!("{} (*{}*): ", context.name, context.pid,); + } + + // Do format_call outside print! so possible exception handlers cannot reentrantly + // deadlock. + let string = format_call(a, b, c, d, e, f, g); + print!("{} = ", string); + + match result { + Ok(ref ok) => { + print!("Ok({} ({:#X}))", ok, ok); + } + Err(ref err) => { + print!("Err({} ({:#X}))", err, err.errno); + } + } + + println!(" in {} ns", debug_duration); +} diff --git a/src/syscall/fs.rs b/src/syscall/fs.rs new file mode 100644 index 0000000000..bf984641f4 --- /dev/null +++ b/src/syscall/fs.rs @@ -0,0 +1,779 @@ +//! 
Filesystem syscalls + +use core::num::NonZeroUsize; + +use alloc::{string::String, sync::Arc, vec::Vec}; +use redox_path::RedoxPath; + +use crate::{ + context::{ + self, + file::{FileDescription, FileDescriptor, InternalFlags, LockedFileDescription}, + memory::{AddrSpace, GenericFlusher, Grant, PageSpan, TlbShootdownActions}, + }, + memory::{Page, VirtualAddress, PAGE_SIZE}, + scheme::{self, FileHandle, KernelScheme, OpenResult, StrOrBytes}, + sync::{CleanLockToken, RwLock}, + syscall::{data::Stat, error::*, flag::*}, +}; + +use super::usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo}; + +pub fn file_op_generic( + fd: FileHandle, + token: &mut CleanLockToken, + op: impl FnOnce(&dyn KernelScheme, usize, &mut CleanLockToken) -> Result, +) -> Result { + file_op_generic_ext(fd, token, |s, _, desc, token| op(s, desc.number, token)) +} +pub fn file_op_generic_ext( + fd: FileHandle, + token: &mut CleanLockToken, + op: impl FnOnce( + &dyn KernelScheme, + Arc, + FileDescription, + &mut CleanLockToken, + ) -> Result, +) -> Result { + let (file, desc) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let file = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?; + let desc = *file.description.read(token.token()); + (file, desc) + }; + + let scheme = scheme::get_scheme(token.token(), desc.scheme)?; + + op(&*scheme, file.description, desc, token) +} +pub fn copy_path_to_buf(raw_path: UserSliceRo, max_len: usize) -> Result { + let mut path_buf = vec![0_u8; max_len]; + if raw_path.len() > path_buf.len() { + return Err(Error::new(ENAMETOOLONG)); + } + let path_len = raw_path.copy_common_bytes_to_slice(&mut path_buf)?; + path_buf.truncate(path_len); + String::from_utf8(path_buf).map_err(|_| Error::new(EINVAL)) + //core::str::from_utf8(&path_buf[..path_len]).map_err(|_| Error::new(EINVAL)) +} +// TODO: Define elsewhere +const PATH_MAX: usize = PAGE_SIZE; + +pub 
fn openat( + fh: FileHandle, + raw_path: UserSliceRo, + flags: usize, + fcntl_flags: u32, + euid: u32, + egid: u32, + token: &mut CleanLockToken, +) -> Result { + let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; + + let (scheme_id, number) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?; + let desc = pipe.description.read(token.token()); + (desc.scheme, desc.number) + }; + + let caller_ctx = context::current() + .read(token.token()) + .caller_ctx() + .filter_uid_gid(euid, egid); + + let new_description = { + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + let res = scheme.kopenat( + number, + StrOrBytes::from_str(&path_buf), + flags, + fcntl_flags, + caller_ctx, + token, + ); + + match res? { + OpenResult::SchemeLocal(number, internal_flags) => { + Arc::new(RwLock::new(FileDescription { + offset: 0, + internal_flags, + scheme: scheme_id, + number, + flags: (flags & !O_CLOEXEC) as u32, + })) + } + OpenResult::External(desc) => desc, + } + }; + + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context + .add_file( + FileDescriptor { + description: new_description, + cloexec: flags & O_CLOEXEC == O_CLOEXEC, + }, + &mut token, + ) + .ok_or(Error::new(EMFILE)) +} +/// Unlinkat syscall +pub fn unlinkat( + fh: FileHandle, + raw_path: UserSliceRo, + flags: usize, + euid: u32, + egid: u32, + token: &mut CleanLockToken, +) -> Result<()> { + let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; + + let (number, scheme_id) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?; + let desc = 
pipe.description.read(token.token()); + (desc.number, desc.scheme) + }; + + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + let caller_ctx = context::current() + .read(token.token()) + .caller_ctx() + .filter_uid_gid(euid, egid); + + /* + let mut path_buf = BorrowedHtBuf::head()?; + let path = path_buf.use_for_string(raw_path)?; + */ + scheme.unlinkat(number, &path_buf, flags, caller_ctx, token) +} + +/// Close syscall +pub fn close(fd: FileHandle, token: &mut CleanLockToken) -> Result<()> { + let file = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context + .remove_file(fd, &mut token) + .ok_or(Error::new(EBADF))? + }; + + file.close(token) +} + +fn duplicate_file( + fd: FileHandle, + user_buf: UserSliceRo, + cloexec: bool, + token: &mut CleanLockToken, +) -> Result { + let (caller_ctx, file) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + ( + context.caller_ctx(), + context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?, + ) + }; + + if user_buf.is_empty() { + Ok(FileDescriptor { + description: Arc::clone(&file.description), + cloexec, + }) + } else { + let description = { *file.description.read(token.token()) }; + + let new_description = { + let scheme = scheme::get_scheme(token.token(), description.scheme)?; + + match scheme.kdup(description.number, user_buf, caller_ctx, token)? 
{ + OpenResult::SchemeLocal(number, internal_flags) => { + Arc::new(RwLock::new(FileDescription { + offset: 0, + internal_flags, + scheme: description.scheme, + number, + flags: description.flags, + })) + } + OpenResult::External(desc) => desc, + } + }; + + Ok(FileDescriptor { + description: new_description, + cloexec, + }) + } +} + +/// Duplicate file descriptor +pub fn dup(fd: FileHandle, buf: UserSliceRo, token: &mut CleanLockToken) -> Result { + let new_file = duplicate_file(fd, buf, false, token)?; + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context + .add_file(new_file, &mut token) + .ok_or(Error::new(EMFILE)) +} + +/// Duplicate file descriptor, replacing another +/// +/// Returns `new_fd` unchanged when `fd == new_fd`. Otherwise `fd` is duplicated first and only then is +/// `new_fd` closed and replaced, so a failed duplication (e.g. `EBADF` for an invalid `fd`) leaves +/// `new_fd` untouched. Returns `EMFILE` if the new descriptor cannot be inserted at `new_fd`. +pub fn dup2( + fd: FileHandle, + new_fd: FileHandle, + buf: UserSliceRo, + token: &mut CleanLockToken, +) -> Result { + if fd == new_fd { + Ok(new_fd) + } else { + // Duplicate before closing: if this fails, `new_fd` must still be open. + let new_file = duplicate_file(fd, buf, false, token)?; + // The result is ignored on purpose: `new_fd` does not have to be open beforehand. + let _ = close(new_fd, token); + + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context + .insert_file(new_fd, new_file, &mut token) + .ok_or(Error::new(EMFILE)) + } +} +pub fn call( + fd: FileHandle, + payload: UserSliceRw, + flags: CallFlags, + metadata: UserSliceRo, + token: &mut CleanLockToken, +) -> Result { + let mut meta = [0_u64; 3]; + + // TODO: bytemuck/plain + let copied = metadata.copy_common_bytes_to_slice(unsafe { + core::slice::from_raw_parts_mut(meta.as_mut_ptr().cast(), meta.len() * 8) + })?; + + match flags { + f if f.contains(CallFlags::WRITE | CallFlags::FD) => { + call_fdwrite(fd, payload, flags, &meta[..copied / 8], token) + } + f if f.contains(CallFlags::READ | CallFlags::FD) => { + call_fdread(fd, payload, flags, &meta[..copied / 8], token) + } + _ => call_normal(fd, payload, flags, &meta[..copied / 8], token), + } +} + +fn call_normal( + fd: FileHandle, + 
payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, +) -> Result { + let file = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + match (current.token_split(), flags.contains(CallFlags::CONSUME)) { + ((ctxt, mut token), true) => ctxt.remove_file(fd, &mut token), + ((ctxt, mut token), false) => ctxt.get_file(fd, &mut token), + } + } + .ok_or(Error::new(EBADF))?; + + let (scheme_id, number) = { + let desc = file.description.read(token.token()); + (desc.scheme, desc.number) + }; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + if flags.contains(CallFlags::STD_FS) { + scheme.translate_std_fs_call(number, file.description, payload, flags, metadata, token) + } else { + scheme.kcall(number, payload, flags, metadata, token) + } +} + +fn call_fdwrite( + fd: FileHandle, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, +) -> Result { + let payload_chunks = payload.in_exact_chunks(size_of::()); + let fds = payload_chunks + .map(|chunk| { + let fd = chunk.read_usize()?; + Ok(FileHandle::from(fd)) + }) + .collect::>>()?; + + let len = fds.len(); + + fdwrite_inner(fd, fds, flags, 0, metadata, token)?; + + Ok(len) +} + +fn fdwrite_inner( + socket: FileHandle, + target_fds: Vec, + flags: CallFlags, + arg: u64, + metadata: &[u64], + token: &mut CleanLockToken, +) -> Result { + // TODO: Ensure deadlocks can't happen + let (scheme, number, descs_to_send) = { + let (scheme, number) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let file_descriptor = context + .get_file(socket, &mut token) + .ok_or(Error::new(EBADF))?; + let desc = &file_descriptor.description.read(token.token()); + (desc.scheme, desc.number) + }; + let scheme = scheme::get_scheme(token.token(), scheme)?; + + let current_lock = context::current(); + let mut current = 
current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + ( + scheme, + number, + if flags.contains(CallFlags::FD_CLONE) { + context.bulk_get_files(&target_fds, &mut token) + } else { + context.bulk_remove_files(&target_fds, &mut token) + }? + .into_iter() + .map(|f| f.description) + .collect(), + ) + }; + + // Inform the scheme whether there are still references to the file description to be sent, + // either in the current file table or in other file tables, regardless of whether EXCLUSIVE is + // requested. + let flags_to_scheme = if flags.contains(CallFlags::FD_EXCLUSIVE) { + for desc in &descs_to_send { + if Arc::strong_count(desc) > 1 { + return Err(Error::new(EBUSY)); + } + } + + CallFlags::FD_EXCLUSIVE + } else { + CallFlags::empty() + }; + + scheme.kfdwrite(number, descs_to_send, flags_to_scheme, arg, metadata, token) +} + +fn call_fdread( + fd: FileHandle, + payload: UserSliceRw, + flags: CallFlags, + metadata: &[u64], + token: &mut CleanLockToken, +) -> Result { + let (scheme, number) = { + let (scheme, number) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let file_descriptor = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?; + let desc = file_descriptor.description.read(token.token()); + (desc.scheme, desc.number) + }; + let scheme = scheme::get_scheme(token.token(), scheme)?; + + (scheme, number) + }; + + scheme.kfdread(number, payload, flags, metadata, token) +} + +pub fn sendfd( + socket: FileHandle, + fd: FileHandle, + flags_raw: usize, + arg: u64, + token: &mut CleanLockToken, +) -> Result { + let sendfd_flags = SendFdFlags::from_bits(flags_raw).ok_or(Error::new(EINVAL))?; + let mut call_flags = CallFlags::FD | CallFlags::WRITE; + if sendfd_flags.contains(SendFdFlags::CLONE) { + call_flags |= CallFlags::FD_CLONE; + } + if sendfd_flags.contains(SendFdFlags::EXCLUSIVE) { + call_flags |= 
CallFlags::FD_EXCLUSIVE; + } + fdwrite_inner(socket, Vec::from([fd]), call_flags, arg, &[], token) +} + +/// File descriptor controls +/// +/// `F_DUPFD`/`F_DUPFD_CLOEXEC` duplicate `fd`, passing `arg` as the minimum to `add_file_min`. +/// `F_GETFD`/`F_SETFD` read or set the descriptor's `cloexec` bit without consulting the scheme. +/// Every other command is forwarded to the scheme first; afterwards `F_GETFL`/`F_SETFL` are applied to +/// the description flags and any other command yields `EINVAL`. Returns `EBADF` if `fd` is not open. +pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken) -> Result { + let file = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context.get_file(fd, &mut token) + } + .ok_or(Error::new(EBADF))?; + + let (scheme_id, number, flags) = { + // Only a snapshot of the fields is needed, so a shared read lock suffices. + let desc = file.description.read(token.token()); + (desc.scheme, desc.number, desc.flags) + }; + + if cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC { + // Not in match because 'files' cannot be locked + let new_file = duplicate_file(fd, UserSlice::empty(), cmd == F_DUPFD_CLOEXEC, token)?; + + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + return context + .add_file_min(new_file, arg, &mut token) + .ok_or(Error::new(EMFILE)) + .map(FileHandle::into); + } + + // Communicate fcntl with scheme + if cmd != F_GETFD && cmd != F_SETFD { + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + scheme.fcntl(number, cmd, arg, token)?; + }; + + // Perform kernel operation if scheme agrees + { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + + let mut files = context.files.write(token.token()); + let (files, mut token) = files.token_split(); + match *files.get_mut(fd.get()).ok_or(Error::new(EBADF))? 
{ + Some(ref mut file) => match cmd { + F_GETFD => { + if file.cloexec { + Ok(O_CLOEXEC) + } else { + Ok(0) + } + } + F_SETFD => { + file.cloexec = arg & O_CLOEXEC == O_CLOEXEC; + Ok(0) + } + F_GETFL => Ok(flags as usize), + F_SETFL => { + let new_flags = (flags & O_ACCMODE as u32) | (arg as u32 & !O_ACCMODE as u32); + file.description.write(token.token()).flags = new_flags; + Ok(0) + } + _ => Err(Error::new(EINVAL)), + }, + None => Err(Error::new(EBADF)), + } + } +} + +pub fn flink(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken) -> Result<()> { + let (caller_ctx, file) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + ( + context.caller_ctx(), + context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?, + ) + }; + + /* + let mut path_buf = BorrowedHtBuf::head()?; + let path = path_buf.use_for_string(raw_path)?; + */ + let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; + let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?; + let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?; + + let (number, scheme_id) = { + let desc = file.description.read(token.token()); + (desc.number, desc.scheme) + }; + + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + // TODO: Check EXDEV. 
+ /* + if scheme_id != description.scheme { + return Err(Error::new(EXDEV)); + } + */ + + scheme.flink(number, reference.as_ref(), caller_ctx, token) +} + +pub fn frename(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken) -> Result<()> { + let (caller_ctx, file) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + ( + context.caller_ctx(), + context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?, + ) + }; + + /* + let mut path_buf = BorrowedHtBuf::head()?; + let path = path_buf.use_for_string(raw_path)?; + */ + let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; + let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?; + let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?; + + let (number, scheme_id) = { + let desc = file.description.read(token.token()); + (desc.number, desc.scheme) + }; + + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + // TODO: Check EXDEV. + /* + if scheme_id != description.scheme { + return Err(Error::new(EXDEV)); + } + */ + + scheme.frename(number, reference.as_ref(), caller_ctx, token) +} + +/// File status +pub fn fstat(fd: FileHandle, user_buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + file_op_generic_ext(fd, token, |scheme, _, desc, token| { + scheme.kfstat(desc.number, user_buf, token)?; + + // TODO: Ensure only the kernel can access the stat when st_dev is set, or use another API + // for retrieving the scheme ID from a file descriptor. + // TODO: Less hacky method. + let st_dev = desc + .scheme + .get() + .try_into() + .map_err(|_| Error::new(EOVERFLOW))?; + user_buf + .advance(core::mem::offset_of!(Stat, st_dev)) + .and_then(|b| b.limit(8)) + .ok_or(Error::new(EIO))? 
+ .copy_from_slice(&u64::to_ne_bytes(st_dev))?; + + Ok(()) + }) +} + +/// Unmaps the pages covering `length` bytes starting at `virtual_address` from the current context's +/// address space and returns 0. `length` is rounded up to a whole number of pages (with a warning when +/// it was unaligned); `EINVAL` is returned when that rounding overflows or when +/// `PageSpan::validate_nonempty` rejects the resulting span. +pub fn funmap(virtual_address: usize, length: usize, token: &mut CleanLockToken) -> Result { + // Partial lengths in funmap are allowed according to POSIX, but not particularly meaningful; + // since the memory needs to SIGSEGV if later read, the entire page needs to disappear. + // + // Thus, while (temporarily) allowing unaligned lengths for compatibility, aligning the length + // should be done by libc. + + // A length within a page of `usize::MAX` cannot be rounded up; reject it explicitly instead of + // overflowing on a caller-controlled value. + let length_aligned = length + .checked_next_multiple_of(PAGE_SIZE) + .ok_or(Error::new(EINVAL))?; + if length != length_aligned { + warn!( + "funmap passed length {:#x} instead of {:#x}", + length, length_aligned + ); + } + + let addr_space = Arc::clone(context::current().read(token.token()).addr_space()?); + let span = PageSpan::validate_nonempty(VirtualAddress::new(virtual_address), length_aligned) + .ok_or(Error::new(EINVAL))?; + let unpin = false; + let notify = addr_space.munmap(span, unpin, token)?; + + for map in notify { + let _ = map.unmap(token); + } + + Ok(0) +} + +pub fn mremap( + old_address: usize, + old_size: usize, + new_address: usize, + new_size: usize, + flags: usize, + token: &mut CleanLockToken, +) -> Result { + if !old_address.is_multiple_of(PAGE_SIZE) + || !old_size.is_multiple_of(PAGE_SIZE) + || !new_address.is_multiple_of(PAGE_SIZE) + || !new_size.is_multiple_of(PAGE_SIZE) + { + return Err(Error::new(EINVAL)); + } + if old_size == 0 || new_size == 0 { + return Err(Error::new(EINVAL)); + } + + let old_base = Page::containing_address(VirtualAddress::new(old_address)); + let new_base = Page::containing_address(VirtualAddress::new(new_address)); + + let mremap_flags = MremapFlags::from_bits_truncate(flags); + let prot_flags = MapFlags::from_bits_truncate(flags) + & (MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::PROT_EXEC); + + let map_flags = if mremap_flags.contains(MremapFlags::FIXED_REPLACE) { + MapFlags::MAP_FIXED + } else if mremap_flags.contains(MremapFlags::FIXED) { + 
MapFlags::MAP_FIXED_NOREPLACE + } else { + MapFlags::empty() + } | prot_flags; + + let addr_space = AddrSpace::current()?; + let src_span = PageSpan::new(old_base, old_size.div_ceil(PAGE_SIZE)); + let new_page_count = new_size.div_ceil(PAGE_SIZE); + let fixed = map_flags.contains(MapFlags::MAP_FIXED) + || map_flags.contains(MapFlags::MAP_FIXED_NOREPLACE); + let requested_dst_base = (new_address != 0 || fixed).then_some(new_base); + + if mremap_flags.contains(MremapFlags::KEEP_OLD) { + // TODO: This is a hack! Find a better interface for replacing this, perhaps a capability + // for non-CoW-borrowed i.e. owned frames, that can be inserted into address spaces. + if new_page_count != 1 { + return Err(Error::new(EOPNOTSUPP)); + } + + let raii_frame = addr_space.borrow_frame_enforce_rw_allocated(src_span.base, token)?; + + let mut token = token.token(); + let base = addr_space.acquire_write(token.downgrade()).mmap( + &addr_space, + requested_dst_base, + NonZeroUsize::new(1).expect("value specified is not zero"), + map_flags, + None, + |page, page_flags, mapper, flusher| { + let frame = raii_frame.take(); + // XXX: add_ref(RefKind::Shared) is internally done by borrow_frame_enforce_rw_allocated(src_span.base). + // The page does not get unref-ed as we call take() on the `raii_frame`. + unsafe { + mapper + .map_phys(page.start_address(), frame.base(), page_flags) + .ok_or(Error::new(ENOMEM))? 
+ .ignore(); + + flusher.queue(frame, None, TlbShootdownActions::NEW_MAPPING); + } + + Ok(Grant::allocated_one_page_nomap(page, page_flags)) + }, + )?; + + Ok(base.start_address().data()) + } else { + let base = addr_space.r#move( + None, + src_span, + requested_dst_base, + new_page_count, + map_flags, + None, + token.downgrade(), + )?; + + Ok(base.start_address().data()) + } +} + +pub fn lseek(fd: FileHandle, pos: i64, whence: usize, token: &mut CleanLockToken) -> Result { + let (fsize, desc) = file_op_generic_ext(fd, token, |scheme, desc_arc, desc, token| { + Ok(if whence == SEEK_END { + (Some(scheme.fsize(desc.number, token)?), desc_arc) + } else { + (None, desc_arc) + }) + })?; + + let mut guard = desc.write(token.token()); + + let new_pos = match whence { + SEEK_SET => pos, + SEEK_CUR => pos + .checked_add_unsigned(guard.offset) + .ok_or(Error::new(EOVERFLOW))?, + SEEK_END => pos + .checked_add_unsigned(fsize.expect("fsize not None as whence is SEEK_END")) + .ok_or(Error::new(EOVERFLOW))?, + _ => return Err(Error::new(EINVAL)), + }; + guard.offset = new_pos.try_into().map_err(|_| Error::new(EINVAL))?; + + Ok(guard.offset as usize) +} +pub fn sys_read(fd: FileHandle, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { + let (bytes_read, desc_arc, desc) = + file_op_generic_ext(fd, token, |scheme, desc_arc, desc, token| { + let offset = if desc.internal_flags.contains(InternalFlags::POSITIONED) { + desc.offset + } else { + u64::MAX + }; + Ok(( + scheme.kreadoff(desc.number, buf, offset, desc.flags, desc.flags, token)?, + desc_arc, + desc, + )) + })?; + if desc.internal_flags.contains(InternalFlags::POSITIONED) { + let offset = &mut desc_arc.write(token.token()).offset; + *offset = offset.saturating_add(bytes_read as u64) + } + Ok(bytes_read) +} +pub fn sys_write(fd: FileHandle, buf: UserSliceRo, token: &mut CleanLockToken) -> Result { + let (bytes_written, desc_arc, desc) = + file_op_generic_ext(fd, token, |scheme, desc_arc, desc, token| { + let offset = if 
desc.internal_flags.contains(InternalFlags::POSITIONED) { + desc.offset + } else { + u64::MAX + }; + Ok(( + scheme.kwriteoff(desc.number, buf, offset, desc.flags, desc.flags, token)?, + desc_arc, + desc, + )) + })?; + if desc.internal_flags.contains(InternalFlags::POSITIONED) { + let offset = &mut desc_arc.write(token.token()).offset; + *offset = offset.saturating_add(bytes_written as u64) + } + Ok(bytes_written) +} diff --git a/src/syscall/futex.rs b/src/syscall/futex.rs new file mode 100644 index 0000000000..4c187b8ec8 --- /dev/null +++ b/src/syscall/futex.rs @@ -0,0 +1,222 @@ +//! # Futex +//! Futex or Fast Userspace Mutex is "a method for waiting until a certain condition becomes true." +//! +//! For more information about futexes, please read [this](https://eli.thegreenplace.net/2018/basics-of-futexes/) blog post, and the [futex(2)](http://man7.org/linux/man-pages/man2/futex.2.html) man page +use alloc::{ + sync::{Arc, Weak}, + vec::Vec, +}; +use core::sync::atomic::{AtomicU32, Ordering}; +use hashbrown::{hash_map::DefaultHashBuilder, HashMap}; +use rmm::Arch; +use syscall::EINTR; + +use crate::{ + context::{ + self, + memory::{AddrSpace, AddrSpaceWrapper}, + ContextLock, + }, + memory::{Page, PhysicalAddress, VirtualAddress}, + sync::{CleanLockToken, Mutex, L1}, + time, +}; + +use crate::syscall::{ + data::TimeSpec, + error::{Error, Result, EAGAIN, EFAULT, EINVAL, ETIMEDOUT}, + flag::{FUTEX_WAIT, FUTEX_WAIT64, FUTEX_WAKE}, +}; + +use super::usercopy::UserSlice; + +// Physical address used as key, required if synchronizing across address spaces +// (necessitates MAP_SHARED since CoW would invalidate this address). +type FutexList = HashMap>; + +pub struct FutexEntry { + // Virtual address, required if synchronizing across the same address space, if the memory is + // CoW. + // TODO: FUTEX_REQUEUE + target_virtaddr: VirtualAddress, + // Context to wake up, and compare address spaces. 
+ context_lock: Arc, + // address space to check against if virt matches but not phys + addr_space: Weak, +} + +// TODO: Process-private futexes? In that case, put the futex table in each AddrSpace, or just +// implement that fully in userspace. Although futex is probably the best API for process-shared +// POSIX synchronization primitives, a local hash table and wait-for-thread kernel APIs (e.g. +// lwp_park/lwp_unpark from NetBSD) could be a simpler replacement. +static FUTEXES: Mutex = + Mutex::new(FutexList::with_hasher(DefaultHashBuilder::new())); + +fn validate_and_translate_virt(space: &AddrSpace, addr: VirtualAddress) -> Option { + // TODO: Move this elsewhere! + if addr.data().saturating_add(size_of::()) >= crate::USER_END_OFFSET { + return None; + } + + let page = Page::containing_address(addr); + let off = addr.data() - page.start_address().data(); + + let (frame, _) = space.table.utable.translate(page.start_address())?; + + Some(frame.add(off)) +} + +pub fn futex( + addr: usize, + op: usize, + val: usize, + val2: usize, + _addr2: usize, + token: &mut CleanLockToken, +) -> Result { + let current_addrsp = AddrSpace::current()?; + + // Keep the address space locked so we can safely read from the physical address. Unlock it + // before context switching. + let addr_space_guard = current_addrsp.acquire_read(token.downgrade()); + + let target_virtaddr = VirtualAddress::new(addr); + let target_physaddr = validate_and_translate_virt(&addr_space_guard, target_virtaddr) + .ok_or(Error::new(EFAULT))?; + + match op { + // TODO: FUTEX_WAIT_MULTIPLE? + FUTEX_WAIT | FUTEX_WAIT64 => { + let timeout_opt = UserSlice::ro(val2, size_of::())? 
+ .none_if_null() + .map(|buf| unsafe { buf.read_exact::() }) + .transpose()?; + + let context_lock = context::current(); + + { + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + let mut futexes = FUTEXES.lock(token.token()); + let (futexes, mut token) = futexes.token_split(); + + let (fetched, expected) = if op == FUTEX_WAIT { + // Must be aligned, otherwise it could cross a page boundary and mess up the + // (simpler) validation we did in the first place. + if !addr.is_multiple_of(4) { + return Err(Error::new(EINVAL)); + } + + // On systems where virtual memory is not abundant, we might instead add an + // atomic usercopy function. + let accessible_addr = crate::memory::RmmA::phys_to_virt(target_physaddr).data(); + + ( + u64::from(unsafe { + (*(accessible_addr as *const AtomicU32)).load(Ordering::SeqCst) + }), + u64::from(val as u32), + ) + } else { + #[cfg(target_has_atomic = "64")] + { + use core::sync::atomic::AtomicU64; + + // op == FUTEX_WAIT64 + if !addr.is_multiple_of(8) { + return Err(Error::new(EINVAL)); + } + ( + unsafe { (*(addr as *const AtomicU64)).load(Ordering::SeqCst) }, + val as u64, + ) + } + #[cfg(not(target_has_atomic = "64"))] + { + return Err(Error::new(crate::syscall::error::EOPNOTSUPP)); + } + }; + if fetched != expected { + return Err(Error::new(EAGAIN)); + } + + { + let mut context = context_lock.write(token.token()); + + context.wake = timeout_opt.map(|TimeSpec { tv_sec, tv_nsec }| { + tv_sec as u128 * time::NANOS_PER_SEC + tv_nsec as u128 + }); + if let Some((tctl, pctl, _)) = context.sigcontrol() + && tctl.currently_pending_unblocked(pctl) != 0 + { + return Err(Error::new(EINTR)); + } + + context.block("futex"); + } + + futexes + .entry(target_physaddr) + .or_insert_with(Vec::new) + .push(FutexEntry { + target_virtaddr, + context_lock: context_lock.clone(), + addr_space: Arc::downgrade(¤t_addrsp), + }); + } + + drop(addr_space_guard); + + context::switch(token); + + let context = 
context_lock.read(token.token()); + + // The scheduler clears `wake` on timeout. Hence if a timeout was + // set and `wake` is now `None`, we timed out. + if context.wake.is_none() && timeout_opt.is_some() { + Err(Error::new(ETIMEDOUT)) + } else { + Ok(0) + } + } + FUTEX_WAKE => { + let mut woken = 0; + + { + drop(addr_space_guard); + let mut futexes_map = FUTEXES.lock(token.token()); + let (futexes_map, mut token) = futexes_map.token_split(); + + let is_empty = if let Some(futexes) = futexes_map.get_mut(&target_physaddr) { + let mut i = 0; + let current_addrsp_weak = Arc::downgrade(¤t_addrsp); + + // TODO: Use something like retain, once it is possible to tell it when to stop iterating... + while i < futexes.len() && woken < val { + // SAFETY: already verified index is less than length + let futex = unsafe { futexes.get_unchecked_mut(i) }; + if futex.target_virtaddr != target_virtaddr + || !current_addrsp_weak.ptr_eq(&futex.addr_space) + { + i += 1; + continue; + } + futex.context_lock.write(token.token()).unblock(); + futexes.swap_remove(i); + woken += 1; + } + + futexes.is_empty() + } else { + false + }; + if is_empty { + futexes_map.remove(&target_physaddr); + } + } + + Ok(woken) + } + _ => Err(Error::new(EINVAL)), + } +} diff --git a/src/syscall/mod.rs b/src/syscall/mod.rs new file mode 100644 index 0000000000..450a9d112f --- /dev/null +++ b/src/syscall/mod.rs @@ -0,0 +1,258 @@ +//! +//! This module provides syscall definitions and the necessary resources to parse incoming +//! 
syscalls + +extern crate syscall; + +use syscall::{dirent::DirentHeader, CallFlags, RwFlags, EINVAL}; + +pub use self::syscall::{ + data, error, flag, io, number, ptrace_event, EnvRegisters, FloatRegisters, IntRegisters, +}; + +pub use self::{fs::*, futex::futex, process::*, time::*, usercopy::validate_region}; + +use self::{ + data::{Map, TimeSpec}, + debug::{debug_end, debug_start}, + error::{Error, Result, ENOSYS}, + flag::{EventFlags, MapFlags}, + number::*, + usercopy::UserSlice, +}; + +use crate::{ + context::memory::AddrSpace, + percpu::PercpuBlock, + scheme::{memory::MemoryScheme, FileHandle}, + sync::CleanLockToken, +}; + +/// Debug +pub mod debug; + +/// Filesystem syscalls +pub mod fs; + +/// Fast userspace mutex +pub mod futex; + +/// Process syscalls +pub mod process; + +/// Time syscalls +pub mod time; + +/// Safely copying memory between user and kernel memory +pub mod usercopy; + +/// This function is the syscall handler of the kernel, it is composed of an inner function that returns a `Result`. After the inner function runs, the syscall +/// function calls [`Error::mux`] on it. 
+#[must_use] +pub fn syscall( + a: usize, + b: usize, + c: usize, + d: usize, + e: usize, + f: usize, + g: usize, + token: &mut CleanLockToken, +) -> usize { + #[inline(always)] + fn inner( + a: usize, + b: usize, + c: usize, + d: usize, + e: usize, + f: usize, + g: usize, + token: &mut CleanLockToken, + ) -> Result { + let fd = FileHandle::from(b); + //SYS_* is declared in kernel/syscall/src/number.rs + match a { + SYS_WRITE2 => file_op_generic_ext(fd, token, |scheme, _, desc, token| { + let flags = if f == usize::MAX { + None + } else { + Some( + u32::try_from(f) + .ok() + .and_then(RwFlags::from_bits) + .ok_or(Error::new(EINVAL))?, + ) + }; + scheme.kwriteoff( + desc.number, + UserSlice::ro(c, d)?, + e as u64, + flags.map_or(desc.flags, |f| desc.rw_flags(f)), + desc.flags, + token, + ) + }), + SYS_WRITE => sys_write(fd, UserSlice::ro(c, d)?, token), + SYS_FMAP => { + let addrspace = AddrSpace::current()?; + let map = unsafe { UserSlice::ro(c, d)?.read_exact::()? }; + if b == !0 { + MemoryScheme::fmap_anonymous(&addrspace, &map, false, token) + } else { + file_op_generic(fd, token, |scheme, number, token| { + scheme.kfmap(number, &addrspace, &map, false, token) + }) + } + } + SYS_GETDENTS => { + let header_size = u16::try_from(e).map_err(|_| Error::new(EINVAL))?; + + if usize::from(header_size) != size_of::() { + // TODO: allow? 
If so, zero_out must be implemented for UserSlice + return Err(Error::new(EINVAL)); + } + + file_op_generic(fd, token, |scheme, number, token| { + scheme.getdents(number, UserSlice::wo(c, d)?, header_size, f as u64, token) + }) + } + SYS_FUTIMENS => file_op_generic(fd, token, |scheme, number, token| { + scheme.kfutimens(number, UserSlice::ro(c, d)?, token) + }), + + SYS_READ2 => file_op_generic_ext(fd, token, |scheme, _, desc, token| { + let flags = if f == usize::MAX { + None + } else { + Some( + u32::try_from(f) + .ok() + .and_then(RwFlags::from_bits) + .ok_or(Error::new(EINVAL))?, + ) + }; + scheme.kreadoff( + desc.number, + UserSlice::wo(c, d)?, + e as u64, + flags.map_or(desc.flags, |f| desc.rw_flags(f)), + desc.flags, + token, + ) + }), + SYS_READ => sys_read(fd, UserSlice::wo(c, d)?, token), + SYS_FPATH => file_op_generic(fd, token, |scheme, number, token| { + scheme.kfpath(number, UserSlice::wo(c, d)?, token) + }), + SYS_FSTAT => fstat(fd, UserSlice::wo(c, d)?, token).map(|()| 0), + SYS_FSTATVFS => file_op_generic(fd, token, |scheme, number, token| { + scheme + .kfstatvfs(number, UserSlice::wo(c, d)?, token) + .map(|()| 0) + }), + + SYS_DUP => dup(fd, UserSlice::ro(c, d)?, token).map(FileHandle::into), + SYS_DUP2 => { + dup2(fd, FileHandle::from(c), UserSlice::ro(d, e)?, token).map(FileHandle::into) + } + + #[cfg(target_pointer_width = "32")] + SYS_SENDFD => sendfd( + fd, + FileHandle::from(c), + d, + e as u64 | ((f as u64) << 32), + token, + ), + + #[cfg(target_pointer_width = "64")] + SYS_SENDFD => sendfd(fd, FileHandle::from(c), d, e as u64, token), + + SYS_LSEEK => lseek(fd, c as i64, d, token), + SYS_FCHMOD => file_op_generic(fd, token, |scheme, number, token| { + scheme.fchmod(number, c as u16, token).map(|()| 0) + }), + SYS_FCHOWN => file_op_generic(fd, token, |scheme, number, token| { + scheme.fchown(number, c as u32, d as u32, token).map(|()| 0) + }), + SYS_FCNTL => fcntl(fd, c, d, token), + SYS_FEVENT => file_op_generic(fd, token, |scheme, number, 
token| { + Ok(scheme + .fevent(number, EventFlags::from_bits_truncate(c), token)? + .bits()) + }), + SYS_FLINK => flink(fd, UserSlice::ro(c, d)?, token).map(|()| 0), + SYS_FRENAME => frename(fd, UserSlice::ro(c, d)?, token).map(|()| 0), + SYS_FUNMAP => funmap(b, c, token), + + SYS_FSYNC => file_op_generic(fd, token, |scheme, number, token| { + scheme.fsync(number, token).map(|()| 0) + }), + // TODO: 64-bit lengths on 32-bit platforms + SYS_FTRUNCATE => file_op_generic(fd, token, |scheme, number, token| { + scheme.ftruncate(number, c, token).map(|()| 0) + }), + + SYS_CLOSE => close(fd, token).map(|()| 0), + SYS_CALL => call( + fd, + UserSlice::rw(c, d)?, + CallFlags::from_bits(e & !0xff).ok_or(Error::new(EINVAL))?, + UserSlice::ro(f, (e & 0xff) * 8)?, + token, + ), + SYS_OPENAT => { + openat(fd, UserSlice::ro(c, d)?, e, f as _, 0, 0, token).map(FileHandle::into) + } + SYS_OPENAT_WITH_FILTER => openat( + fd, + UserSlice::ro(c, d)?, + e, + (e & syscall::O_FCNTL_MASK) as _, + f as _, + g as _, + token, + ) + .map(FileHandle::into), + SYS_UNLINKAT => unlinkat(fd, UserSlice::ro(c, d)?, e, 0, 0, token).map(|()| 0), + SYS_UNLINKAT_WITH_FILTER => { + unlinkat(fd, UserSlice::ro(c, d)?, e, f as _, g as _, token).map(|()| 0) + } + SYS_YIELD => sched_yield(token).map(|()| 0), + SYS_NANOSLEEP => nanosleep( + UserSlice::ro(b, size_of::())?, + UserSlice::wo(c, size_of::())?.none_if_null(), + token, + ) + .map(|()| 0), + SYS_CLOCK_GETTIME => { + clock_gettime(b, UserSlice::wo(c, size_of::())?, token).map(|()| 0) + } + SYS_FUTEX => futex(b, c, d, e, f, token), + + SYS_MPROTECT => mprotect(b, c, MapFlags::from_bits_truncate(d), token).map(|()| 0), + SYS_MREMAP => mremap(b, c, d, e, f, token), + + _ => Err(Error::new(ENOSYS)), + } + } + + PercpuBlock::current().inside_syscall.set(true); + + debug_start([a, b, c, d, e, f, g], token); + + let result = inner(a, b, c, d, e, f, g, token); + + debug_end([a, b, c, d, e, f, g], result, token); + + let percpu = PercpuBlock::current(); + 
percpu.inside_syscall.set(false); + + if percpu.switch_internals.being_sigkilled.get() { + exit_this_context(None, token); + } + + // errormux turns Result into -errno + Error::mux(result) +} diff --git a/src/syscall/process.rs b/src/syscall/process.rs new file mode 100644 index 0000000000..e83da427b4 --- /dev/null +++ b/src/syscall/process.rs @@ -0,0 +1,294 @@ +use alloc::sync::Arc; +use core::{mem, num::NonZeroUsize}; + +use rmm::Arch; +use syscall::data::GlobalSchemes; + +use crate::{ + context::{ + context::SyscallFrame, + file::{FileDescription, FileDescriptor, InternalFlags}, + memory::{AddrSpace, Grant, PageSpan}, + ContextRef, + }, + event, + sync::{CleanLockToken, RwLock}, + syscall::flag::{EventFlags, O_CREAT, O_RDWR}, +}; + +use crate::{ + context::{self, context::FdTbl}, + memory::{Page, VirtualAddress, PAGE_SIZE}, + scheme::{ + KernelScheme, SchemeExt, SchemeId, SchemeList, ALL_KERNEL_SCHEMES, KERNEL_SCHEMES_COUNT, + }, + startup::Bootstrap, + syscall::{error::*, flag::MapFlags}, + CurrentRmmArch, +}; + +use super::usercopy::UserSliceWo; + +pub fn exit_this_context(excp: Option, token: &mut CleanLockToken) -> ! 
{ + let mut close_files; + let addrspace_opt; + + let context_lock = context::current(); + { + let mut context = context_lock.write(token.token()); + // let (context, mut token) = context.token_split(); + close_files = Arc::try_unwrap(mem::take(&mut context.files)) + .map_or_else(|_| FdTbl::new(), RwLock::into_inner); + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + addrspace_opt = context + .set_addr_space(None, token.downgrade()) + .and_then(|a| Arc::try_unwrap(a).ok()); + drop(mem::replace(&mut context.syscall_head, SyscallFrame::Dummy)); + drop(mem::replace(&mut context.syscall_tail, SyscallFrame::Dummy)); + } + + // Files must be closed while context is valid so that messages can be passed + close_files.force_close_all(token); + if let Some(addrspace) = addrspace_opt { + addrspace.into_drop(token); + } + // TODO: Should status == Status::HardBlocked be handled differently? + let owner = { + let mut guard = context_lock.write(token.token()); + guard.status = context::Status::Dead { excp }; + guard.owner_proc_id + }; + if let Some(owner) = owner { + event::trigger( + GlobalSchemes::Proc.scheme_id(), + owner.get(), + EventFlags::EVENT_READ, + token, + ); + } + { + if !context::contexts_mut(token.downgrade()).remove(&ContextRef(context_lock)) { + #[cfg(feature = "drop_panic")] + { + panic!("This context is not in the cpu") + } + } + } + context::switch(token); + unreachable!(); +} + +pub fn mprotect( + address: usize, + size: usize, + flags: MapFlags, + token: &mut CleanLockToken, +) -> Result<()> { + // println!("mprotect {:#X}, {}, {:#X}", address, size, flags); + + let span = PageSpan::validate_nonempty(VirtualAddress::new(address), size) + .ok_or(Error::new(EINVAL))?; + + AddrSpace::current()?.mprotect(span, flags, token) +} + +const KERNEL_METADATA_BASE: usize = crate::USER_END_OFFSET - syscall::KERNEL_METADATA_SIZE; +const KERNEL_METADATA_PAGE_COUNT: usize = syscall::KERNEL_METADATA_SIZE / PAGE_SIZE + { + if 
syscall::KERNEL_METADATA_SIZE.is_multiple_of(PAGE_SIZE) { + 0 + } else { + 1 + } +}; + +pub unsafe fn usermode_bootstrap(bootstrap: &Bootstrap, token: &mut CleanLockToken) { + assert_ne!(bootstrap.page_count, 0); + + { + let addr_space = Arc::clone( + context::current() + .read(token.token()) + .addr_space() + .expect("expected bootstrap context to have an address space"), + ); + + let base = Page::containing_address(VirtualAddress::new(PAGE_SIZE)); + let flags = MapFlags::MAP_FIXED_NOREPLACE + | MapFlags::PROT_EXEC + | MapFlags::PROT_READ + | MapFlags::PROT_WRITE; + + let page_count = + NonZeroUsize::new(bootstrap.page_count).expect("bootstrap contained no pages!"); + + let _base_page = { + let mut lock_token = token.token(); + let mut addr_space_lock = addr_space.acquire_write(lock_token.downgrade()); + addr_space_lock + .mmap( + &addr_space, + Some(base), + page_count, + flags, + None, + |page, flags, mapper, flusher| { + let shared = false; + Ok(Grant::zeroed( + PageSpan::new(page, bootstrap.page_count), + flags, + mapper, + flusher, + shared, + )?) + }, + ) + .expect("Failed to allocate bootstrap pages") + }; + + // Insert kernel schemes root capabilities. + let mut kernel_schemes_infos = + [syscall::data::KernelSchemeInfo::default(); KERNEL_SCHEMES_COUNT]; + assert_eq!(kernel_schemes_infos.len(), ALL_KERNEL_SCHEMES.len()); + for (i, scheme) in ALL_KERNEL_SCHEMES.iter().enumerate() { + if let Some(inner) = kernel_schemes_infos.get_mut(i) { + inner.scheme_id = scheme.scheme_id().get() as u8; + inner.fd = { + let cap_fd = match scheme.as_scheme().scheme_root(token) { + Ok(fd) => fd, + Err(_) => usize::MAX, + }; + insert_fd( + scheme.scheme_id(), + cap_fd, + matches!(scheme, GlobalSchemes::Proc), + token, + ) + }; + } + } + // Insert a scheme creation capability for the usermode bootstrap. + let scheme_creation_cap = { + // First, get the scheme root to initialize the schemelist. 
+ let cap_fd = match &SchemeList.scheme_root(token) { + Ok(fd) => *fd, + Err(_) => usize::MAX, + }; + // Second, retrieve the scheme ID. + let scheme_id = &SchemeList.id(); + insert_fd(*scheme_id, cap_fd, false, token) + }; + + let mut lock_token = token.token(); + let kernel_schemes_info_page = addr_space + .acquire_write(lock_token.downgrade()) + .mmap( + &addr_space, + Some(Page::containing_address(VirtualAddress::new( + KERNEL_METADATA_BASE, + ))), + NonZeroUsize::new(KERNEL_METADATA_PAGE_COUNT).unwrap(), + MapFlags::MAP_FIXED_NOREPLACE | MapFlags::PROT_READ | MapFlags::PROT_WRITE, + None, + |page, flags, mapper, flusher| { + let shared = false; + Ok(Grant::zeroed( + PageSpan::new(page, KERNEL_METADATA_PAGE_COUNT), + flags, + mapper, + flusher, + shared, + )?) + }, + ) + .expect("Failed to allocate kernel scheme info page"); + + let mut cursor = kernel_schemes_info_page.start_address().data(); + const HEADER_SIZE: usize = size_of::(); + UserSliceWo::new(cursor, HEADER_SIZE) + .expect("failed to create kernel schemes header user slice") + .copy_common_bytes_from_slice(&KERNEL_SCHEMES_COUNT.to_ne_bytes()) + .expect("failed to copy kernel schemes count"); + cursor += HEADER_SIZE; + let info_bytes = unsafe { + core::slice::from_raw_parts( + kernel_schemes_infos.as_ptr() as *const u8, + KERNEL_SCHEMES_COUNT * size_of::(), + ) + }; + UserSliceWo::new( + cursor, + KERNEL_SCHEMES_COUNT * size_of::(), + ) + .expect("failed to create kernel schemes info user slice") + .copy_common_bytes_from_slice(info_bytes) + .expect("failed to copy kernel schemes info"); + cursor += KERNEL_SCHEMES_COUNT * size_of::(); + UserSliceWo::new(cursor, size_of::()) + .expect("failed to create scheme creation cap user slice") + .copy_common_bytes_from_slice(&scheme_creation_cap.to_ne_bytes()) + .expect("failed to copy scheme creation cap"); + + mprotect( + KERNEL_METADATA_BASE, + KERNEL_METADATA_PAGE_COUNT * PAGE_SIZE, + MapFlags::PROT_READ, + token, + ) + .expect("failed to mprotect kernel 
schemes info page"); + } + + let bootstrap_slice = unsafe { bootstrap_mem(bootstrap) }; + UserSliceWo::new(PAGE_SIZE, bootstrap.page_count * PAGE_SIZE) + .expect("failed to create bootstrap user slice") + .copy_from_slice(bootstrap_slice) + .expect("failed to copy memory to bootstrap"); + + let bootstrap_entry = u64::from_le_bytes(bootstrap_slice[0x1a..0x22].try_into().unwrap()); + debug!("Bootstrap entry point: {:X}", bootstrap_entry); + assert_ne!(bootstrap_entry, 0); + + // Start in a minimal environment without any stack. + + let ctx = context::current(); + let mut lock = ctx.write(token.token()); + let regs = &mut lock + .regs_mut() + .expect("bootstrap needs registers to be available"); + { + regs.init(); + regs.set_instr_pointer(bootstrap_entry.try_into().unwrap()); + } +} + +unsafe fn bootstrap_mem(bootstrap: &crate::startup::Bootstrap) -> &'static [u8] { + unsafe { + core::slice::from_raw_parts( + CurrentRmmArch::phys_to_virt(bootstrap.base.base()).data() as *const u8, + bootstrap.page_count * PAGE_SIZE, + ) + } +} + +fn insert_fd(scheme: SchemeId, number: usize, cloexec: bool, token: &mut CleanLockToken) -> usize { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + context + .add_file_min( + FileDescriptor { + description: Arc::new(RwLock::new(FileDescription { + scheme, + number, + offset: 0, + flags: (O_CREAT | O_RDWR) as u32, + internal_flags: InternalFlags::empty(), + })), + cloexec, + }, + syscall::flag::UPPER_FDTBL_TAG + scheme.get(), + &mut token, + ) + .expect("failed to insert fd to current context") + .get() +} diff --git a/src/syscall/time.rs b/src/syscall/time.rs new file mode 100644 index 0000000000..aa6763f5b0 --- /dev/null +++ b/src/syscall/time.rs @@ -0,0 +1,91 @@ +use crate::{ + context, + sync::CleanLockToken, + syscall::{ + data::TimeSpec, + error::*, + flag::{CLOCK_MONOTONIC, CLOCK_REALTIME}, + }, + time, +}; + +use 
super::usercopy::{UserSliceRo, UserSliceWo}; + +pub fn clock_gettime(clock: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { + let arch_time = match clock { + CLOCK_REALTIME => time::realtime(token), + CLOCK_MONOTONIC => time::monotonic(token), + _ => return Err(Error::new(EINVAL)), + }; + + buf.copy_exactly(&TimeSpec { + tv_sec: (arch_time / time::NANOS_PER_SEC) as i64, + tv_nsec: (arch_time % time::NANOS_PER_SEC) as i32, + }) +} + +/// Nanosleep will sleep by switching the current context +pub fn nanosleep( + req_buf: UserSliceRo, + rem_buf_opt: Option, + token: &mut CleanLockToken, +) -> Result<()> { + let req = unsafe { req_buf.read_exact::()? }; + + if req.tv_sec < 0 || req.tv_nsec < 0 || req.tv_nsec >= time::NANOS_PER_SEC as i32 { + return Err(Error::new(EINVAL)); + } + + let start = time::monotonic(token); + let end = start + (req.tv_sec as u128 * time::NANOS_PER_SEC) + (req.tv_nsec as u128); + + let current_context = context::current(); + { + let mut context = current_context.write(token.token()); + + if let Some((tctl, pctl, _)) = context.sigcontrol() + && tctl.currently_pending_unblocked(pctl) != 0 + { + return Err(Error::new(EINTR)); + } + + context.wake = Some(end); + context.block("nanosleep"); + } + + // TODO: The previous wakeup reason was most likely signals, but is there any other possible + // reason? 
+ context::switch(token); + + let was_interrupted = current_context.write(token.token()).wake.take().is_some(); + + if let Some(rem_buf) = rem_buf_opt { + let current = time::monotonic(token); + + rem_buf.copy_exactly(&if current < end { + let diff = end - current; + TimeSpec { + tv_sec: (diff / time::NANOS_PER_SEC) as i64, + tv_nsec: (diff % time::NANOS_PER_SEC) as i32, + } + } else { + TimeSpec { + tv_sec: 0, + tv_nsec: 0, + } + })?; + } + + if was_interrupted { + Err(Error::new(EINTR)) + } else { + Ok(()) + } +} + +pub fn sched_yield(token: &mut CleanLockToken) -> Result<()> { + context::switch(token); + // TODO: Do this check in userspace + context::signal::signal_handler(token); + Ok(()) +} diff --git a/src/syscall/usercopy.rs b/src/syscall/usercopy.rs new file mode 100644 index 0000000000..b32925238a --- /dev/null +++ b/src/syscall/usercopy.rs @@ -0,0 +1,255 @@ +use syscall::dirent::Buffer; + +use crate::{ + context::memory::PageSpan, + memory::{Page, VirtualAddress, PAGE_SIZE}, +}; + +use crate::arch::{arch_copy_from_user, arch_copy_to_user}; + +use crate::syscall::error::{Error, Result, EFAULT, EINVAL}; + +#[derive(Clone, Copy)] +pub struct UserSlice { + base: usize, + len: usize, +} +pub type UserSliceRo = UserSlice; +pub type UserSliceWo = UserSlice; +pub type UserSliceRw = UserSlice; + +impl UserSlice { + pub fn empty() -> Self { + Self { base: 0, len: 0 } + } + pub fn len(&self) -> usize { + self.len + } + pub fn is_empty(&self) -> bool { + self.len == 0 + } + pub fn addr(&self) -> usize { + self.base + } + pub fn new(base: usize, len: usize) -> Result { + if base >= crate::USER_END_OFFSET || base.saturating_add(len) >= crate::USER_END_OFFSET { + return Err(Error::new(EFAULT)); + } + + Ok(Self { base, len }) + } + /// Split [0, end) into [0, idx) and [idx, end) + pub fn split_at(self, idx: usize) -> Option<(Self, Self)> { + if idx > self.len { + return None; + } + Some(( + Self { + base: self.base, + len: idx, + }, + Self { + base: self.base + idx, + 
len: self.len - idx, + }, + )) + } + pub fn advance(self, by: usize) -> Option { + Some(self.split_at(by)?.1) + } + pub fn limit(self, to: usize) -> Option { + Some(self.split_at(to)?.0) + } + pub fn none_if_null(self) -> Option { + if self.addr() == 0 { + None + } else { + Some(self) + } + } + /// Not unsafe, because user memory is not covered by the memory model that decides if + /// something is UB, but it can break logic invariants + pub fn reinterpret_unchecked( + self, + ) -> UserSlice { + UserSlice { + base: self.base, + len: self.len, + } + } + pub fn in_variable_chunks(self, chunk_size: usize) -> impl Iterator { + (0..self.len()).step_by(chunk_size).map(move |i| { + self.advance(i) + .expect("already limited by length, must succeed") + }) + } + pub fn in_exact_chunks(self, chunk_size: usize) -> impl Iterator { + (0..self.len().div_floor(chunk_size)).map(move |i| { + self.advance(i * chunk_size) + .expect("already limited by length, must succeed") + .limit(chunk_size) + .expect("length is aligned") + }) + } +} +impl UserSlice { + pub fn copy_to_slice(self, slice: &mut [u8]) -> Result<()> { + debug_assert!(is_kernel_mem(slice)); + + if self.len != slice.len() { + return Err(Error::new(EINVAL)); + } + + if unsafe { arch_copy_from_user(slice.as_mut_ptr() as usize, self.base, self.len) } == 0 { + Ok(()) + } else { + Err(Error::new(EFAULT)) + } + } + pub unsafe fn read_exact(self) -> Result { + let mut t: T = unsafe { core::mem::zeroed() }; + let slice = unsafe { + core::slice::from_raw_parts_mut((&mut t as *mut T).cast::(), size_of::()) + }; + + self.limit(size_of::()) + .ok_or(Error::new(EINVAL))? + .copy_to_slice(slice)?; + + Ok(t) + } + pub fn copy_common_bytes_to_slice(self, slice: &mut [u8]) -> Result { + let min = core::cmp::min(self.len(), slice.len()); + self.limit(min) + .expect("min(len, x) is always <= len") + .copy_to_slice(&mut slice[..min])?; + Ok(min) + } + // TODO: Merge int IO functions? 
+ pub fn read_usize(self) -> Result { + let mut ret = 0_usize.to_ne_bytes(); + self.limit(size_of::()) + .ok_or(Error::new(EINVAL))? + .copy_to_slice(&mut ret)?; + Ok(usize::from_ne_bytes(ret)) + } + pub fn read_u32(self) -> Result { + let mut ret = 0_u32.to_ne_bytes(); + self.limit(4) + .ok_or(Error::new(EINVAL))? + .copy_to_slice(&mut ret)?; + Ok(u32::from_ne_bytes(ret)) + } + pub fn usizes(self) -> impl Iterator> { + self.in_exact_chunks(size_of::()) + .map(Self::read_usize) + } +} +impl UserSlice { + pub fn copy_from_slice(self, slice: &[u8]) -> Result<()> { + // A zero-sized slice will likely have 0x1 as its address, so it is exempt from the kernel-memory check + debug_assert!(is_kernel_mem(slice) || slice.is_empty()); + + if self.len != slice.len() { + return Err(Error::new(EINVAL)); + } + + if unsafe { arch_copy_to_user(self.base, slice.as_ptr() as usize, self.len) } == 0 { + Ok(()) + } else { + Err(Error::new(EFAULT)) + } + } + pub fn copy_common_bytes_from_slice(self, slice: &[u8]) -> Result { + let min = core::cmp::min(self.len(), slice.len()); + self.limit(min) + .expect("min(len, x) is always <= len") + .copy_from_slice(&slice[..min])?; + Ok(min) + } + pub fn copy_exactly(self, slice: &[u8]) -> Result<()> { + self.limit(slice.len()) + .ok_or(Error::new(EINVAL))? + .copy_from_slice(slice)?; + Ok(()) + } + pub fn write_usize(self, word: usize) -> Result<()> { + self.limit(size_of::()) + .ok_or(Error::new(EINVAL))? + .copy_from_slice(&word.to_ne_bytes())?; + Ok(()) + } + pub fn write_u32(self, int: u32) -> Result<()> { + self.limit(size_of::()) + .ok_or(Error::new(EINVAL))? 
+ .copy_from_slice(&int.to_ne_bytes())?; + Ok(()) + } +} + +impl UserSliceRo { + pub fn ro(base: usize, size: usize) -> Result { + Self::new(base, size) + } +} +impl UserSliceWo { + pub fn wo(base: usize, size: usize) -> Result { + Self::new(base, size) + } +} +impl UserSliceRw { + pub fn rw(base: usize, size: usize) -> Result { + Self::new(base, size) + } + pub fn into_ro(self) -> Result { + UserSliceRo::ro(self.base, self.len) + } + pub fn into_wo(self) -> Result { + UserSliceWo::wo(self.base, self.len) + } +} + +fn is_kernel_mem(slice: &[u8]) -> bool { + (slice.as_ptr() as usize) >= crate::USER_END_OFFSET + && (slice.as_ptr() as usize).checked_add(slice.len()).is_some() +} + +/// Convert `[addr, addr+size)` into `(page, page_count)`. +/// +/// This will fail if: +/// +/// - the base address is not page-aligned, +/// - the length is not page-aligned, +/// - the region is empty (EINVAL), or +/// - any byte in the region exceeds USER_END_OFFSET (EFAULT). +pub fn validate_region(address: usize, size: usize) -> Result { + if !address.is_multiple_of(PAGE_SIZE) || !size.is_multiple_of(PAGE_SIZE) || size == 0 { + return Err(Error::new(EINVAL)); + } + if address.saturating_add(size) > crate::USER_END_OFFSET { + return Err(Error::new(EFAULT)); + } + Ok(PageSpan::new( + Page::containing_address(VirtualAddress::new(address)), + size / PAGE_SIZE, + )) +} +impl Buffer<'static> for UserSliceWo { + fn empty() -> Self { + UserSliceWo::empty() + } + fn length(&self) -> usize { + self.len() + } + fn split_at(self, index: usize) -> Option<[Self; 2]> { + let (a, b) = self.split_at(index)?; + Some([a, b]) + } + fn copy_from_slice_exact(self, src: &[u8]) -> Result<()> { + self.copy_exactly(src) + } + fn zero_out(self) -> Result<()> { + // TODO: Implement this. Don't need to as long as the header size is constant, for now. 
+ Ok(()) + } +} diff --git a/src/time.rs b/src/time.rs new file mode 100644 index 0000000000..72992f1d42 --- /dev/null +++ b/src/time.rs @@ -0,0 +1,36 @@ +use crate::{ + sync::{CleanLockToken, Mutex, RwLock, L1}, + syscall::error::{Error, Result, EINVAL}, +}; + +pub const NANOS_PER_SEC: u128 = 1_000_000_000; + +// TODO: seqlock? +/// Kernel start time, measured in nanoseconds since Unix epoch +pub static START: Mutex = Mutex::new(0); +/// Kernel up time, measured in nanoseconds since `START` +pub static OFFSET: RwLock = RwLock::new(0); + +pub fn monotonic(token: &mut CleanLockToken) -> u128 { + crate::arch::time::monotonic_absolute(token) +} + +pub fn realtime(token: &mut CleanLockToken) -> u128 { + let start = { *START.lock(token.token()) }; + let offset = { monotonic(token) }; + start + offset +} + +pub fn monotonic_resolution() -> u128 { + crate::arch::time::monotonic_resolution() +} + +pub fn realtime_resolution() -> u128 { + monotonic_resolution() +} + +pub fn sys_update_time_offset(buf: &[u8], token: &mut CleanLockToken) -> Result { + let start = <[u8; 16]>::try_from(buf).map_err(|_| Error::new(EINVAL))?; + *START.lock(token.token()) = u128::from_ne_bytes(start); + Ok(16) +} diff --git a/targets/aarch64-unknown-kernel.json b/targets/aarch64-unknown-kernel.json new file mode 100644 index 0000000000..58b72ffac3 --- /dev/null +++ b/targets/aarch64-unknown-kernel.json @@ -0,0 +1,24 @@ +{ + "abi": "softfloat", + "arch": "aarch64", + "data-layout": "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32", + "disable-redzone": true, + "dynamic-linking": false, + "env": "", + "exe-suffix": "", + "executables": true, + "features": "+strict-align,-neon,-fp-armv8,+tpidr-el1", + "frame-pointer": "always", + "has-rpath": false, + "linker": "rust-lld", + "linker-flavor": "gnu-lld", + "llvm-target": "aarch64-unknown-none", + "no-default-libraries": true, + "os": "none", + "position-independent-executables": false, + 
"relocation-model": "pic", + "target-c-int-width": 32, + "target-endian": "little", + "target-pointer-width": 64, + "vendor": "unknown" +} diff --git a/targets/i586-unknown-kernel.json b/targets/i586-unknown-kernel.json new file mode 100644 index 0000000000..062e4c76ec --- /dev/null +++ b/targets/i586-unknown-kernel.json @@ -0,0 +1,26 @@ +{ + "arch": "x86", + "code-model": "kernel", + "data-layout": "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128", + "disable-redzone": true, + "dynamic-linking": false, + "env": "", + "exe-suffix": "", + "executables": true, + "features": "-mmx,-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,+soft-float", + "frame-pointer": "always", + "has-rpath": false, + "linker": "rust-lld", + "linker-flavor": "gnu-lld", + "llvm-target": "i686-unknown-none", + "max-atomic-width": 64, + "no-default-libraries": true, + "os": "none", + "position-independent-executables": false, + "relocation-model": "static", + "rustc-abi": "x86-softfloat", + "target-c-int-width": 32, + "target-endian": "little", + "target-pointer-width": 32, + "vendor": "unknown" +} diff --git a/targets/riscv64-unknown-kernel.json b/targets/riscv64-unknown-kernel.json new file mode 100644 index 0000000000..f4c499d61f --- /dev/null +++ b/targets/riscv64-unknown-kernel.json @@ -0,0 +1,24 @@ +{ + "arch": "riscv64", + "data-layout": "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128", + "disable-redzone": true, + "dynamic-linking": false, + "env": "", + "exe-suffix": "", + "executables": true, + "features": "+m,+a,+c,+zihintpause", + "frame-pointer": "always", + "has-rpath": false, + "linker": "rust-lld", + "linker-flavor": "gnu-lld", + "llvm-abiname": "lp64", + "llvm-target": "riscv64-unknown-none", + "no-default-libraries": true, + "os": "none", + "position-independent-executables": false, + "relocation-model": "pic", + "target-c-int-width": 32, + "target-endian": "little", + "target-pointer-width": 64, + "vendor": "unknown" +} diff --git 
a/targets/x86_64-unknown-kernel.json b/targets/x86_64-unknown-kernel.json new file mode 100644 index 0000000000..cf2f8b892b --- /dev/null +++ b/targets/x86_64-unknown-kernel.json @@ -0,0 +1,25 @@ +{ + "arch": "x86_64", + "code-model": "kernel", + "data-layout": "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", + "disable-redzone": true, + "dynamic-linking": false, + "env": "", + "exe-suffix": "", + "executables": true, + "features": "-mmx,-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,+soft-float", + "frame-pointer": "always", + "has-rpath": false, + "linker": "rust-lld", + "linker-flavor": "gnu-lld", + "llvm-target": "x86_64-unknown-none", + "no-default-libraries": true, + "os": "none", + "position-independent-executables": false, + "relocation-model": "static", + "rustc-abi": "x86-softfloat", + "target-c-int-width": 32, + "target-endian": "little", + "target-pointer-width": 64, + "vendor": "unknown" +}