--- a/init/src/service.rs +++ b/init/src/service.rs @@ -40,6 +40,28 @@ pub inherit_envs: BTreeSet, #[serde(rename = "type")] pub type_: ServiceType, + /// Restart policy for this service. Default: Never (no restart on exit). + #[serde(default)] + pub restart: RestartPolicy, + /// Maximum consecutive restart attempts before giving up. Default: 3. + #[serde(default = "default_max_restarts")] + pub max_restarts: u32, +} + +fn default_max_restarts() -> u32 { + 3 +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq)] +#[serde(rename_all = "kebab-case")] +pub enum RestartPolicy { + /// Never restart (default — current behavior). + #[default] + Never, + /// Restart on non-zero exit code. + OnFailure, + /// Restart on any exit (including clean). + Always, } #[derive(Clone, Debug, Default, Deserialize)] @@ -53,7 +75,9 @@ } impl Service { - pub fn spawn(&self, base_envs: &BTreeMap) { + /// Returns Some(child_pid) for long-running services (Notify, Scheme), + /// None for Oneshot/OneshotAsync or if the spawn failed. + pub fn spawn(&self, base_envs: &BTreeMap) -> Option { let mut command = Command::new(&self.cmd); command.args(self.args.iter().map(|arg| subst_env(arg))); command.env_clear(); @@ -72,6 +96,7 @@ status_fail(&format!("failed to execute {:?}: {}", command, err)); } } + None } _ => { let (mut read_pipe, write_pipe) = io::pipe().unwrap(); @@ -85,7 +110,7 @@ let _ = unsafe { libc::close(write_raw) }; drop(read_pipe); status_fail(&format!("failed to execute {:?}: {}", command, err)); - return; + return None; } }; @@ -100,7 +125,7 @@ )); let _ = child.kill(); let _ = child.wait(); - return; + return None; } match read_pipe.read_exact(&mut [0]) { Ok(()) => {} @@ -111,6 +136,7 @@ init_error(&format!("failed to wait for {:?}: {}", command, err)); } } + Some(child.id()) } ServiceType::Scheme(scheme) => { if !poll_fd_timeout(read_pipe.as_raw_fd(), SERVICE_READY_TIMEOUT_SECS) { @@ -120,7 +146,7 @@ )); let _ = child.kill(); let _ = child.wait(); - return; + return None; } let mut new_fd = usize::MAX; loop { @@ -135,16 +161,16 @@ }) => continue, Ok(0) => { init_warn(&format!("{:?} exited without notifying readiness", command)); - return; + return None; } Ok(1) => break, Ok(n) => { init_error(&format!("incorrect amount of fds {} returned", n)); - return; + return None; } Err(err) => { init_error(&format!("failed to wait for {:?}: {}", command, err)); - return; + return None; } } } @@ -152,6 +178,7 @@ let current_namespace_fd = libredox::call::getns().expect("TODO"); libredox::call::register_scheme_to_ns(current_namespace_fd, scheme, new_fd) .expect("TODO"); + Some(child.id()) } ServiceType::Oneshot => { drop(read_pipe); @@ -165,6 +192,7 @@ init_error(&format!("failed to wait for {:?}: {}", command, err)) } } + None } ServiceType::OneshotAsync => unreachable!(), } --- a/init/src/scheduler.rs +++ b/init/src/scheduler.rs @@ -1,14 +1,28 @@ -use std::collections::{BTreeSet, VecDeque}; +use std::collections::{BTreeMap, BTreeSet, VecDeque}; use crate::InitConfig; -use crate::color::{init_error, status_ok, status_skip}; +use crate::color::{init_error, init_warn, status_ok, status_skip}; +use crate::service::RestartPolicy; use crate::unit::{Unit, UnitId, UnitKind, UnitStore}; const SPAWN_BATCH_SIZE: usize = 50; +/// Tracks the restart state for a supervised service. +pub struct ServiceState { + pub unit_id: UnitId, + pub cmd: String, + pub restart_policy: RestartPolicy, + pub max_restarts: u32, + pub restart_count: u32, + /// Monotonic time of last restart (for backoff calculation). + pub last_restart_ms: u64, +} + pub struct Scheduler { pending: VecDeque, completed: BTreeSet, + /// Maps child PID → service state for supervised services. + pub supervised: BTreeMap, } struct Job { @@ -25,6 +39,7 @@ Scheduler { pending: VecDeque::new(), completed: BTreeSet::new(), + supervised: BTreeMap::new(), } } @@ -106,7 +121,7 @@ defer_count = 0; let unit = unit_store.unit_mut(&job.unit); - run(unit, init_config); + run(unit, init_config, &mut self.supervised); self.completed.insert(job.unit); spawned_this_step += 1; @@ -119,7 +134,7 @@ } } -fn run(unit: &mut Unit, config: &mut InitConfig) { +fn run(unit: &mut Unit, config: &mut InitConfig, supervised: &mut BTreeMap) { match &unit.kind { UnitKind::LegacyScript { script } => { for cmd in script.clone() { @@ -127,13 +142,28 @@ } } UnitKind::Service { service } => { - let desc = unit.info.description.as_ref().unwrap_or(&unit.id.0); + let desc = unit.info.description.as_ref().unwrap_or(&unit.id.0).clone(); if config.skip_cmd.contains(&service.cmd) { status_skip(&format!("Skipping {} ({})", desc, service.cmd)); return; } status_ok(&format!("Started {}", desc)); - service.spawn(&config.envs); + if let Some(pid) = service.spawn(&config.envs) { + // Only supervise services with a restart policy other than Never + if service.restart != RestartPolicy::Never { + supervised.insert( + pid, + ServiceState { + unit_id: unit.id.clone(), + cmd: service.cmd.clone(), + restart_policy: service.restart.clone(), + max_restarts: service.max_restarts, + restart_count: 0, + last_restart_ms: 0, + }, + ); + } + } } UnitKind::Target {} => {} } --- a/init/src/main.rs +++ b/init/src/main.rs @@ -5,7 +5,8 @@ use libredox::flag::{O_RDONLY, O_WRONLY}; -use crate::scheduler::Scheduler; +use crate::scheduler::{Scheduler, ServiceState}; +use crate::service::RestartPolicy; use crate::unit::{UnitId, UnitStore}; mod color; @@ -176,15 +177,100 @@ if scheduler.has_pending() { // Reap exited children before processing more services. let mut status = 0; - while libredox::call::waitpid(0, &mut status, 1).is_ok() {} + while let Ok(pid) = libredox::call::waitpid(0, &mut status, 1) { + handle_child_exit(pid as u32, status, &mut scheduler, &mut unit_store, &mut init_config); + } scheduler.step(&mut unit_store, &mut init_config); } let mut status = 0; match libredox::call::waitpid(0, &mut status, 1) { - Ok(_pid) => {} + Ok(pid) => { + handle_child_exit(pid as u32, status, &mut scheduler, &mut unit_store, &mut init_config); + } Err(err) => init_error(&format!("waitpid error: {}", err)), } } } + +/// Handle a child process exit. If it's a supervised service, apply restart policy. +fn handle_child_exit( + pid: u32, + exit_status: i32, + scheduler: &mut Scheduler, + unit_store: &mut UnitStore, + init_config: &mut InitConfig, +) { + let Some(state) = scheduler.supervised.remove(&pid) else { + return; // Not a supervised service — just reap + }; + + let exited_cleanly = exit_status == 0; + let should_restart = match state.restart_policy { + RestartPolicy::Never => false, + RestartPolicy::OnFailure => !exited_cleanly, + RestartPolicy::Always => true, + }; + + if !should_restart { + init_warn(&format!( + "service {} (pid {}) exited with status {} — not restarting (policy: {:?})", + state.cmd, pid, exit_status, state.restart_policy, + )); + return; + } + + if state.restart_count >= state.max_restarts { + init_warn(&format!( + "service {} exceeded max_restarts ({}) — giving up", + state.cmd, state.max_restarts, + )); + return; + } + + // Exponential backoff: 1s, 2s, 4s, 8s, ... up to 30s max + let backoff_secs = 1u64 << state.restart_count.min(4); + let backoff_secs = backoff_secs.min(30); + init_warn(&format!( + "service {} (pid {}) exited with status {} — restarting in {}s (attempt {}/{})", + state.cmd, + pid, + exit_status, + backoff_secs, + state.restart_count + 1, + state.max_restarts, + )); + + // Sleep for backoff period (blocking, but init is the supervisor) + std::thread::sleep(std::time::Duration::from_secs(backoff_secs)); + + // Re-spawn the service by scheduling a restart + let unit_id = state.unit_id.clone(); + let new_restart_count = state.restart_count + 1; + + // Load the unit and re-run it + let unit = unit_store.unit_mut(&unit_id); + if let crate::unit::UnitKind::Service { service } = &unit.kind { + if let Some(new_pid) = service.spawn(&init_config.envs) { + init_warn(&format!( + "restarted service {} as pid {} (attempt {}/{})", + state.cmd, new_pid, new_restart_count, state.max_restarts, + )); + scheduler.supervised.insert( + new_pid, + ServiceState { + unit_id, + cmd: service.cmd.clone(), + restart_policy: state.restart_policy.clone(), + max_restarts: state.max_restarts, + restart_count: new_restart_count, + last_restart_ms: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64, + }, + ); + } + } +}