Files
RedBear-OS/local/patches/base/P18-1-daemon-restart.patch
T
vasilito cee25393d8 fix: boot process improvements — dependency cycle, INIT_NOTIFY, probing loop, and log spam fixes
- Fix P15-8-init-cycle-detection.patch: replace visiting+error with seen+silent-skip
  to eliminate 11 false-positive 'dependency cycle detected' errors on shared deps
- Fix P0-daemon-fix-init-notify-unwrap.patch: remove eprintln! for missing
  INIT_NOTIFY (expected for oneshot_async services, ~7 daemons affected)
- Fix driver-manager hotplug loop: add PERMANENTLY_SKIPPED static set shared
  between hotplug handler and DriverConfig::probe() to stop infinite re-probing
  of Fatal/NotSupported/deferred-exhausted device+driver pairs (e.g. ided)
- Fix driver-manager log_timeline: suppress repeated EPIPE/ENOENT errors with
  AtomicI32 dedup and AtomicBool one-shot guards for boot timeline JSON
- Add driver-manager SIGTERM handler, ACPI bus registration, --status mode,
  driver reap loop, graceful shutdown, and reduced deferred retries (30→3)
2026-05-17 12:34:02 +03:00

327 lines
12 KiB
Diff

--- a/init/src/service.rs
+++ b/init/src/service.rs
@@ -40,6 +40,28 @@
pub inherit_envs: BTreeSet<String>,
#[serde(rename = "type")]
pub type_: ServiceType,
+ /// Restart policy for this service. Default: Never (no restart on exit).
+ #[serde(default)]
+ pub restart: RestartPolicy,
+ /// Maximum consecutive restart attempts before giving up. Default: 3.
+ #[serde(default = "default_max_restarts")]
+ pub max_restarts: u32,
+}
+
+fn default_max_restarts() -> u32 {
+ 3 // serde fallback for `max_restarts` when the field is absent from the config
+}
+
+#[derive(Clone, Debug, Default, Deserialize, PartialEq)]
+#[serde(rename_all = "kebab-case")]
+pub enum RestartPolicy {
+ /// Never restart (default — current behavior). Deserialized from `never`.
+ #[default]
+ Never,
+ /// Restart only when the reaped wait status is non-zero. Deserialized from `on-failure`.
+ OnFailure,
+ /// Restart on any exit (including clean). Deserialized from `always`.
+ Always,
}
#[derive(Clone, Debug, Default, Deserialize)]
@@ -53,7 +75,9 @@
}
impl Service {
- pub fn spawn(&self, base_envs: &BTreeMap<String, OsString>) {
+ /// Returns Some(child_pid) for long-running services (Notify, Scheme),
+ /// None for Oneshot/OneshotAsync or if the spawn failed.
+ pub fn spawn(&self, base_envs: &BTreeMap<String, OsString>) -> Option<u32> {
let mut command = Command::new(&self.cmd);
command.args(self.args.iter().map(|arg| subst_env(arg)));
command.env_clear();
@@ -72,6 +96,7 @@
status_fail(&format!("failed to execute {:?}: {}", command, err));
}
}
+ None
}
_ => {
let (mut read_pipe, write_pipe) = io::pipe().unwrap();
@@ -85,7 +110,7 @@
let _ = unsafe { libc::close(write_raw) };
drop(read_pipe);
status_fail(&format!("failed to execute {:?}: {}", command, err));
- return;
+ return None;
}
};
@@ -100,7 +125,7 @@
));
let _ = child.kill();
let _ = child.wait();
- return;
+ return None;
}
match read_pipe.read_exact(&mut [0]) {
Ok(()) => {}
@@ -111,6 +136,7 @@
init_error(&format!("failed to wait for {:?}: {}", command, err));
}
}
+ Some(child.id())
}
ServiceType::Scheme(scheme) => {
if !poll_fd_timeout(read_pipe.as_raw_fd(), SERVICE_READY_TIMEOUT_SECS) {
@@ -120,7 +146,7 @@
));
let _ = child.kill();
let _ = child.wait();
- return;
+ return None;
}
let mut new_fd = usize::MAX;
loop {
@@ -135,16 +161,16 @@
}) => continue,
Ok(0) => {
init_warn(&format!("{:?} exited without notifying readiness", command));
- return;
+ return None;
}
Ok(1) => break,
Ok(n) => {
init_error(&format!("incorrect amount of fds {} returned", n));
- return;
+ return None;
}
Err(err) => {
init_error(&format!("failed to wait for {:?}: {}", command, err));
- return;
+ return None;
}
}
}
@@ -152,6 +178,7 @@
let current_namespace_fd = libredox::call::getns().expect("TODO");
libredox::call::register_scheme_to_ns(current_namespace_fd, scheme, new_fd)
.expect("TODO");
+ Some(child.id())
}
ServiceType::Oneshot => {
drop(read_pipe);
@@ -165,6 +192,7 @@
init_error(&format!("failed to wait for {:?}: {}", command, err))
}
}
+ None
}
ServiceType::OneshotAsync => unreachable!(),
}
--- a/init/src/scheduler.rs
+++ b/init/src/scheduler.rs
@@ -1,14 +1,28 @@
-use std::collections::{BTreeSet, VecDeque};
+use std::collections::{BTreeMap, BTreeSet, VecDeque};
use crate::InitConfig;
-use crate::color::{init_error, status_ok, status_skip};
+use crate::color::{init_error, init_warn, status_ok, status_skip};
+use crate::service::RestartPolicy;
use crate::unit::{Unit, UnitId, UnitKind, UnitStore};
const SPAWN_BATCH_SIZE: usize = 50;
+/// Restart bookkeeping for one supervised service, stored under the PID of its live child.
+pub struct ServiceState {
+ pub unit_id: UnitId, // unit to look up again when the service has to be respawned
+ pub cmd: String, // command name, used in restart log messages
+ pub restart_policy: RestartPolicy,
+ pub max_restarts: u32, // restarts stop once `restart_count` reaches this value
+ pub restart_count: u32, // restarts performed so far; 0 for the initial spawn
+ /// Last restart as wall-clock ms since the Unix epoch (not monotonic); 0 before any restart.
+ pub last_restart_ms: u64,
+}
+
pub struct Scheduler {
pending: VecDeque<Job>,
completed: BTreeSet<UnitId>,
+ /// Child PID → state, only for services whose policy is not `Never`; re-keyed on each restart.
+ pub supervised: BTreeMap<u32, ServiceState>,
}
struct Job {
@@ -25,6 +39,7 @@
Scheduler {
pending: VecDeque::new(),
completed: BTreeSet::new(),
+ supervised: BTreeMap::new(),
}
}
@@ -106,7 +121,7 @@
defer_count = 0;
let unit = unit_store.unit_mut(&job.unit);
- run(unit, init_config);
+ run(unit, init_config, &mut self.supervised);
self.completed.insert(job.unit);
spawned_this_step += 1;
@@ -119,7 +134,7 @@
}
}
-fn run(unit: &mut Unit, config: &mut InitConfig) {
+fn run(unit: &mut Unit, config: &mut InitConfig, supervised: &mut BTreeMap<u32, ServiceState>) {
match &unit.kind {
UnitKind::LegacyScript { script } => {
for cmd in script.clone() {
@@ -127,13 +142,28 @@
}
}
UnitKind::Service { service } => {
- let desc = unit.info.description.as_ref().unwrap_or(&unit.id.0);
+ let desc = unit.info.description.as_ref().unwrap_or(&unit.id.0).clone();
if config.skip_cmd.contains(&service.cmd) {
status_skip(&format!("Skipping {} ({})", desc, service.cmd));
return;
}
status_ok(&format!("Started {}", desc));
- service.spawn(&config.envs);
+ if let Some(pid) = service.spawn(&config.envs) {
+ // Only supervise services with a restart policy other than Never
+ if service.restart != RestartPolicy::Never {
+ supervised.insert(
+ pid,
+ ServiceState {
+ unit_id: unit.id.clone(),
+ cmd: service.cmd.clone(),
+ restart_policy: service.restart.clone(),
+ max_restarts: service.max_restarts,
+ restart_count: 0,
+ last_restart_ms: 0,
+ },
+ );
+ }
+ }
}
UnitKind::Target {} => {}
}
--- a/init/src/main.rs
+++ b/init/src/main.rs
@@ -5,7 +5,8 @@
use libredox::flag::{O_RDONLY, O_WRONLY};
-use crate::scheduler::Scheduler;
+use crate::scheduler::{Scheduler, ServiceState};
+use crate::service::RestartPolicy;
use crate::unit::{UnitId, UnitStore};
mod color;
@@ -176,15 +177,100 @@
if scheduler.has_pending() {
// Reap exited children before processing more services.
let mut status = 0;
- while libredox::call::waitpid(0, &mut status, 1).is_ok() {}
+ while let Ok(pid) = libredox::call::waitpid(0, &mut status, 1) {
+ handle_child_exit(pid as u32, status, &mut scheduler, &mut unit_store, &mut init_config);
+ }
scheduler.step(&mut unit_store, &mut init_config);
}
let mut status = 0;
match libredox::call::waitpid(0, &mut status, 1) {
- Ok(_pid) => {}
+ Ok(pid) => {
+ handle_child_exit(pid as u32, status, &mut scheduler, &mut unit_store, &mut init_config);
+ }
Err(err) => init_error(&format!("waitpid error: {}", err)),
}
}
}
+
+/// Handle a reaped child: if `pid` is a supervised service, apply its restart policy.
+/// `exit_status` is the raw `waitpid` status; only `0` counts as a clean exit. The entry
+/// for `pid` is always removed from `scheduler.supervised` and is re-inserted under the
+/// new PID only after a successful respawn. NOTE(review): the backoff sleep blocks the
+/// caller, and `restart_count` is never reset here, so `max_restarts` is a lifetime cap.
+fn handle_child_exit(
+    pid: u32,
+    exit_status: i32,
+    scheduler: &mut Scheduler,
+    unit_store: &mut UnitStore,
+    init_config: &mut InitConfig,
+) {
+    let Some(state) = scheduler.supervised.remove(&pid) else {
+        return; // Not a supervised service — just reap
+    };
+
+    let exited_cleanly = exit_status == 0;
+    let should_restart = match state.restart_policy {
+        RestartPolicy::Never => false,
+        RestartPolicy::OnFailure => !exited_cleanly,
+        RestartPolicy::Always => true,
+    };
+
+    if !should_restart {
+        init_warn(&format!(
+            "service {} (pid {}) exited with status {} — not restarting (policy: {:?})",
+            state.cmd, pid, exit_status, state.restart_policy,
+        ));
+        return;
+    }
+
+    if state.restart_count >= state.max_restarts {
+        init_warn(&format!(
+            "service {} exceeded max_restarts ({}) — giving up",
+            state.cmd, state.max_restarts,
+        ));
+        return;
+    }
+
+    // Backoff: 1s, 2s, 4s, 8s, 16s, then capped at 30s (shift clamped to 5 so the cap applies).
+    let backoff_secs = (1u64 << state.restart_count.min(5)).min(30);
+    let attempt = state.restart_count + 1;
+    init_warn(&format!(
+        "service {} (pid {}) exited with status {} — restarting in {}s (attempt {}/{})",
+        state.cmd, pid, exit_status, backoff_secs, attempt, state.max_restarts,
+    ));
+
+    std::thread::sleep(std::time::Duration::from_secs(backoff_secs));
+    // Look the unit up again and respawn it; on failure the service is no longer supervised.
+    let unit = unit_store.unit_mut(&state.unit_id);
+    let crate::unit::UnitKind::Service { service } = &unit.kind else {
+        init_warn(&format!("cannot restart {}: unit is not a service", state.cmd));
+        return;
+    };
+    let Some(new_pid) = service.spawn(&init_config.envs) else {
+        init_warn(&format!(
+            "failed to restart service {} (attempt {}/{}) — giving up",
+            state.cmd, attempt, state.max_restarts,
+        ));
+        return;
+    };
+    init_warn(&format!(
+        "restarted service {} as pid {} (attempt {}/{})",
+        state.cmd, new_pid, attempt, state.max_restarts,
+    ));
+    scheduler.supervised.insert(
+        new_pid,
+        ServiceState {
+            unit_id: state.unit_id,
+            cmd: service.cmd.clone(),
+            restart_policy: state.restart_policy,
+            max_restarts: state.max_restarts,
+            restart_count: attempt,
+            last_restart_ms: std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap_or_default()
+                .as_millis() as u64,
+        },
+    );
+}