cee25393d8
- Fix P15-8-init-cycle-detection.patch: replace visiting+error with seen+silent-skip to eliminate 11 false-positive 'dependency cycle detected' errors on shared deps - Fix P0-daemon-fix-init-notify-unwrap.patch: remove eprintln! for missing INIT_NOTIFY (expected for oneshot_async services, ~7 daemons affected) - Fix driver-manager hotplug loop: add PERMANENTLY_SKIPPED static set shared between hotplug handler and DriverConfig::probe() to stop infinite re-probing of Fatal/NotSupported/deferred-exhausted device+driver pairs (e.g. ided) - Fix driver-manager log_timeline: suppress repeated EPIPE/ENOENT errors with AtomicI32 dedup and AtomicBool one-shot guards for boot timeline JSON - Add driver-manager SIGTERM handler, ACPI bus registration, --status mode, driver reap loop, graceful shutdown, and reduced deferred retries (30→3)
327 lines
12 KiB
Diff
327 lines
12 KiB
Diff
--- a/init/src/service.rs
|
|
+++ b/init/src/service.rs
|
|
@@ -40,6 +40,28 @@
|
|
pub inherit_envs: BTreeSet<String>,
|
|
#[serde(rename = "type")]
|
|
pub type_: ServiceType,
|
|
+ /// Restart policy for this service. Default: Never (no restart on exit).
|
|
+ #[serde(default)]
|
|
+ pub restart: RestartPolicy,
|
|
+ /// Maximum number of restart attempts before giving up. Default: 3.
|
|
+ #[serde(default = "default_max_restarts")]
|
|
+ pub max_restarts: u32,
|
|
+}
|
|
+
|
|
+fn default_max_restarts() -> u32 {
|
|
+ 3
|
|
+}
|
|
+
|
|
+#[derive(Clone, Debug, Default, Deserialize, PartialEq)]
|
|
+#[serde(rename_all = "kebab-case")]
|
|
+pub enum RestartPolicy {
|
|
+ /// Never restart (default — current behavior).
|
|
+ #[default]
|
|
+ Never,
|
|
+ /// Restart on non-zero exit code.
|
|
+ OnFailure,
|
|
+ /// Restart on any exit (including clean).
|
|
+ Always,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Default, Deserialize)]
|
|
@@ -53,7 +75,9 @@
|
|
}
|
|
|
|
impl Service {
|
|
- pub fn spawn(&self, base_envs: &BTreeMap<String, OsString>) {
|
|
+ /// Returns Some(child_pid) for long-running services (Notify, Scheme),
|
|
+ /// None for Oneshot/OneshotAsync, or if the spawn failed, readiness timed out, or the scheme handshake failed.
|
|
+ pub fn spawn(&self, base_envs: &BTreeMap<String, OsString>) -> Option<u32> {
|
|
let mut command = Command::new(&self.cmd);
|
|
command.args(self.args.iter().map(|arg| subst_env(arg)));
|
|
command.env_clear();
|
|
@@ -72,6 +96,7 @@
|
|
status_fail(&format!("failed to execute {:?}: {}", command, err));
|
|
}
|
|
}
|
|
+ None
|
|
}
|
|
_ => {
|
|
let (mut read_pipe, write_pipe) = io::pipe().unwrap();
|
|
@@ -85,7 +110,7 @@
|
|
let _ = unsafe { libc::close(write_raw) };
|
|
drop(read_pipe);
|
|
status_fail(&format!("failed to execute {:?}: {}", command, err));
|
|
- return;
|
|
+ return None;
|
|
}
|
|
};
|
|
|
|
@@ -100,7 +125,7 @@
|
|
));
|
|
let _ = child.kill();
|
|
let _ = child.wait();
|
|
- return;
|
|
+ return None;
|
|
}
|
|
match read_pipe.read_exact(&mut [0]) {
|
|
Ok(()) => {}
|
|
@@ -111,6 +136,7 @@
|
|
init_error(&format!("failed to wait for {:?}: {}", command, err));
|
|
}
|
|
}
|
|
+ Some(child.id())
|
|
}
|
|
ServiceType::Scheme(scheme) => {
|
|
if !poll_fd_timeout(read_pipe.as_raw_fd(), SERVICE_READY_TIMEOUT_SECS) {
|
|
@@ -120,7 +146,7 @@
|
|
));
|
|
let _ = child.kill();
|
|
let _ = child.wait();
|
|
- return;
|
|
+ return None;
|
|
}
|
|
let mut new_fd = usize::MAX;
|
|
loop {
|
|
@@ -135,16 +161,16 @@
|
|
}) => continue,
|
|
Ok(0) => {
|
|
init_warn(&format!("{:?} exited without notifying readiness", command));
|
|
- return;
|
|
+ return None;
|
|
}
|
|
Ok(1) => break,
|
|
Ok(n) => {
|
|
init_error(&format!("incorrect amount of fds {} returned", n));
|
|
- return;
|
|
+ return None;
|
|
}
|
|
Err(err) => {
|
|
init_error(&format!("failed to wait for {:?}: {}", command, err));
|
|
- return;
|
|
+ return None;
|
|
}
|
|
}
|
|
}
|
|
@@ -152,6 +178,7 @@
|
|
let current_namespace_fd = libredox::call::getns().expect("TODO");
|
|
libredox::call::register_scheme_to_ns(current_namespace_fd, scheme, new_fd)
|
|
.expect("TODO");
|
|
+ Some(child.id())
|
|
}
|
|
ServiceType::Oneshot => {
|
|
drop(read_pipe);
|
|
@@ -165,6 +192,7 @@
|
|
init_error(&format!("failed to wait for {:?}: {}", command, err))
|
|
}
|
|
}
|
|
+ None
|
|
}
|
|
ServiceType::OneshotAsync => unreachable!(),
|
|
}
|
|
--- a/init/src/scheduler.rs
|
|
+++ b/init/src/scheduler.rs
|
|
@@ -1,14 +1,28 @@
|
|
-use std::collections::{BTreeSet, VecDeque};
|
|
+use std::collections::{BTreeMap, BTreeSet, VecDeque};
|
|
|
|
use crate::InitConfig;
|
|
-use crate::color::{init_error, status_ok, status_skip};
|
|
+use crate::color::{init_error, init_warn, status_ok, status_skip};
|
|
+use crate::service::RestartPolicy;
|
|
use crate::unit::{Unit, UnitId, UnitKind, UnitStore};
|
|
|
|
const SPAWN_BATCH_SIZE: usize = 50;
|
|
|
|
+/// Tracks the restart state for a supervised service.
|
|
+pub struct ServiceState {
|
|
+ pub unit_id: UnitId,
|
|
+ pub cmd: String,
|
|
+ pub restart_policy: RestartPolicy,
|
|
+ pub max_restarts: u32,
|
|
+ pub restart_count: u32,
|
|
+ /// Wall-clock time of the last restart in milliseconds since the UNIX epoch; 0 until the first restart.
|
|
+ pub last_restart_ms: u64,
|
|
+}
|
|
+
|
|
pub struct Scheduler {
|
|
pending: VecDeque<Job>,
|
|
completed: BTreeSet<UnitId>,
|
|
+ /// Maps child PID → service state for supervised services.
|
|
+ pub supervised: BTreeMap<u32, ServiceState>,
|
|
}
|
|
|
|
struct Job {
|
|
@@ -25,6 +39,7 @@
|
|
Scheduler {
|
|
pending: VecDeque::new(),
|
|
completed: BTreeSet::new(),
|
|
+ supervised: BTreeMap::new(),
|
|
}
|
|
}
|
|
|
|
@@ -106,7 +121,7 @@
|
|
defer_count = 0;
|
|
|
|
let unit = unit_store.unit_mut(&job.unit);
|
|
- run(unit, init_config);
|
|
+ run(unit, init_config, &mut self.supervised);
|
|
self.completed.insert(job.unit);
|
|
spawned_this_step += 1;
|
|
|
|
@@ -119,7 +134,7 @@
|
|
}
|
|
}
|
|
|
|
-fn run(unit: &mut Unit, config: &mut InitConfig) {
|
|
+fn run(unit: &mut Unit, config: &mut InitConfig, supervised: &mut BTreeMap<u32, ServiceState>) {
|
|
match &unit.kind {
|
|
UnitKind::LegacyScript { script } => {
|
|
for cmd in script.clone() {
|
|
@@ -127,13 +142,28 @@
|
|
}
|
|
}
|
|
UnitKind::Service { service } => {
|
|
- let desc = unit.info.description.as_ref().unwrap_or(&unit.id.0);
|
|
+ let desc = unit.info.description.as_ref().unwrap_or(&unit.id.0).clone();
|
|
if config.skip_cmd.contains(&service.cmd) {
|
|
status_skip(&format!("Skipping {} ({})", desc, service.cmd));
|
|
return;
|
|
}
|
|
status_ok(&format!("Started {}", desc));
|
|
- service.spawn(&config.envs);
|
|
+ if let Some(pid) = service.spawn(&config.envs) {
|
|
+ // Only supervise services with a restart policy other than Never
|
|
+ if service.restart != RestartPolicy::Never {
|
|
+ supervised.insert(
|
|
+ pid,
|
|
+ ServiceState {
|
|
+ unit_id: unit.id.clone(),
|
|
+ cmd: service.cmd.clone(),
|
|
+ restart_policy: service.restart.clone(),
|
|
+ max_restarts: service.max_restarts,
|
|
+ restart_count: 0,
|
|
+ last_restart_ms: 0,
|
|
+ },
|
|
+ );
|
|
+ }
|
|
+ }
|
|
}
|
|
UnitKind::Target {} => {}
|
|
}
|
|
--- a/init/src/main.rs
|
|
+++ b/init/src/main.rs
|
|
@@ -5,7 +5,8 @@
|
|
|
|
use libredox::flag::{O_RDONLY, O_WRONLY};
|
|
|
|
-use crate::scheduler::Scheduler;
|
|
+use crate::scheduler::{Scheduler, ServiceState};
|
|
+use crate::service::RestartPolicy;
|
|
use crate::unit::{UnitId, UnitStore};
|
|
|
|
mod color;
|
|
@@ -176,15 +177,100 @@
|
|
if scheduler.has_pending() {
|
|
// Reap exited children before processing more services.
|
|
let mut status = 0;
|
|
- while libredox::call::waitpid(0, &mut status, 1).is_ok() {}
|
|
+ while let Ok(pid) = libredox::call::waitpid(0, &mut status, 1) {
|
|
+ handle_child_exit(pid as u32, status, &mut scheduler, &mut unit_store, &mut init_config);
|
|
+ }
|
|
|
|
scheduler.step(&mut unit_store, &mut init_config);
|
|
}
|
|
|
|
let mut status = 0;
|
|
match libredox::call::waitpid(0, &mut status, 1) {
|
|
- Ok(_pid) => {}
|
|
+ Ok(pid) => {
|
|
+ handle_child_exit(pid as u32, status, &mut scheduler, &mut unit_store, &mut init_config);
|
|
+ }
|
|
Err(err) => init_error(&format!("waitpid error: {}", err)),
|
|
}
|
|
}
|
|
}
|
|
+
|
|
+/// Handle a child process exit. If it's a supervised service, apply restart policy.
|
|
+fn handle_child_exit(
|
|
+ pid: u32,
|
|
+ exit_status: i32,
|
|
+ scheduler: &mut Scheduler,
|
|
+ unit_store: &mut UnitStore,
|
|
+ init_config: &mut InitConfig,
|
|
+) {
|
|
+ let Some(state) = scheduler.supervised.remove(&pid) else {
|
|
+ return; // Not a supervised service — just reap
|
|
+ };
|
|
+
|
|
+ let exited_cleanly = exit_status == 0;
|
|
+ let should_restart = match state.restart_policy {
|
|
+ RestartPolicy::Never => false,
|
|
+ RestartPolicy::OnFailure => !exited_cleanly,
|
|
+ RestartPolicy::Always => true,
|
|
+ };
|
|
+
|
|
+ if !should_restart {
|
|
+ init_warn(&format!(
|
|
+ "service {} (pid {}) exited with status {} — not restarting (policy: {:?})",
|
|
+ state.cmd, pid, exit_status, state.restart_policy,
|
|
+ ));
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if state.restart_count >= state.max_restarts {
|
|
+ init_warn(&format!(
|
|
+ "service {} exceeded max_restarts ({}) — giving up",
|
|
+ state.cmd, state.max_restarts,
|
|
+ ));
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ // Exponential backoff: 1s, 2s, 4s, 8s, then 16s for every later attempt (the 30s cap below is never reached)
|
|
+ let backoff_secs = 1u64 << state.restart_count.min(4);
|
|
+ let backoff_secs = backoff_secs.min(30);
|
|
+ init_warn(&format!(
|
|
+ "service {} (pid {}) exited with status {} — restarting in {}s (attempt {}/{})",
|
|
+ state.cmd,
|
|
+ pid,
|
|
+ exit_status,
|
|
+ backoff_secs,
|
|
+ state.restart_count + 1,
|
|
+ state.max_restarts,
|
|
+ ));
|
|
+
|
|
+ // Sleep for the backoff period (blocking: init's main loop reaps and starts nothing else until it elapses)
|
|
+ std::thread::sleep(std::time::Duration::from_secs(backoff_secs));
|
|
+
|
|
+ // Re-spawn the service immediately; the restart does not go through the scheduler queue
|
|
+ let unit_id = state.unit_id.clone();
|
|
+ let new_restart_count = state.restart_count + 1;
|
|
+
|
|
+ // Load the unit and re-run it
|
|
+ let unit = unit_store.unit_mut(&unit_id);
|
|
+ if let crate::unit::UnitKind::Service { service } = &unit.kind {
|
|
+ if let Some(new_pid) = service.spawn(&init_config.envs) {
|
|
+ init_warn(&format!(
|
|
+ "restarted service {} as pid {} (attempt {}/{})",
|
|
+ state.cmd, new_pid, new_restart_count, state.max_restarts,
|
|
+ ));
|
|
+ scheduler.supervised.insert(
|
|
+ new_pid,
|
|
+ ServiceState {
|
|
+ unit_id,
|
|
+ cmd: service.cmd.clone(),
|
|
+ restart_policy: state.restart_policy.clone(),
|
|
+ max_restarts: state.max_restarts,
|
|
+ restart_count: new_restart_count,
|
|
+ last_restart_ms: std::time::SystemTime::now()
|
|
+ .duration_since(std::time::UNIX_EPOCH)
|
|
+ .unwrap_or_default()
|
|
+ .as_millis() as u64,
|
|
+ },
|
|
+ );
|
|
+ }
|
|
+ }
|
|
+}
|