diff --git a/selfdrive/manager/process.py b/selfdrive/manager/process.py index e7d0d59..026da18 100755 --- a/selfdrive/manager/process.py +++ b/selfdrive/manager/process.py @@ -47,13 +47,10 @@ def launcher(proc: str, name: str, log_path: str) -> None: raise def join_process(process: Process, timeout: float) -> None: - # Process().join(timeout) will hang due to a python 3 bug: https://bugs.python.org/issue28382 - # We have to poll the exitcode instead t = time.monotonic() while time.monotonic() - t < timeout and process.exitcode is None: time.sleep(0.001) - class ManagerProcess(ABC): daemon = False sigkill = False @@ -61,7 +58,6 @@ class ManagerProcess(ABC): proc: Process | None = None enabled = True name = "" - last_watchdog_time = 0 watchdog_max_dt: int | None = None watchdog_seen = False @@ -75,24 +71,22 @@ class ManagerProcess(ABC): def start(self) -> None: pass + def restart(self) -> None: - self.stop(sig=signal.SIGKILL) + if self.proc is not None and self.proc.exitcode is not None: + self.stop(sig=signal.SIGKILL, block=False) self.start() def check_watchdog(self, started: bool) -> None: if self.watchdog_max_dt is None or self.proc is None: return - try: fn = WATCHDOG_FN + str(self.proc.pid) with open(fn, "rb") as f: - # TODO: why can't pylint find struct.unpack? self.last_watchdog_time = struct.unpack('Q', f.read())[0] except Exception: pass - dt = time.monotonic() - self.last_watchdog_time / 1e9 - if dt > self.watchdog_max_dt: if (self.watchdog_seen or self.always_watchdog and self.proc.exitcode is not None) and ENABLE_WATCHDOG: cloudlog.error(f"Watchdog timeout for {self.name} (exitcode {self.proc.exitcode}) restarting ({started=})") @@ -103,7 +97,6 @@ class ManagerProcess(ABC): def stop(self, retry: bool = True, block: bool = True, sig: signal.Signals = None) -> int | None: if self.proc is None: return None - if self.proc.exitcode is None: if not self.shutting_down: cloudlog.info(f"killing {self.name}") @@ -111,39 +104,23 @@ class ManagerProcess(ABC): sig = signal.SIGKILL if self.sigkill else signal.SIGINT self.signal(sig) self.shutting_down = True - if not block: return None - join_process(self.proc, 5) - - # If process failed to die send SIGKILL if self.proc.exitcode is None and retry: cloudlog.info(f"killing {self.name} with SIGKILL") self.signal(signal.SIGKILL) self.proc.join() - ret = self.proc.exitcode cloudlog.info(f"{self.name} is dead with {ret}") - if self.proc.exitcode is not None: self.shutting_down = False self.proc = None - return ret def signal(self, sig: int) -> None: - if self.proc is None: + if self.proc is None or self.proc.exitcode is not None or self.proc.pid is None: return - - # Don't signal if already exited - if self.proc.exitcode is not None and self.proc.pid is not None: - return - - # Can't signal if we don't have a pid - if self.proc.pid is None: - return - cloudlog.info(f"sending signal {sig} to {self.name}") os.kill(self.proc.pid, sig) @@ -243,6 +220,7 @@ class DaemonProcess(ManagerProcess): def stop(self, retry=True, block=True, sig=None) -> None: pass + def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None, CP: car.CarParams=None, not_run: list[str] | None=None, log_dir: str = None) -> list[ManagerProcess]: if not_run is None: not_run = [] @@ -251,11 +229,12 @@ def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None for p in procs: log_path = log_dir+"/"+p.name+".log" if p.enabled and p.name not in not_run and p.should_run(started, params, CP): - p.start(log_path) + if p.proc is None or p.proc.exitcode is not None: + p.start(log_path) running.append(p) else: p.stop(block=False) p.check_watchdog(started) - return running + return running \ No newline at end of file