This commit is contained in:
Your Name
2024-06-15 20:59:47 -05:00
parent 6ef4519798
commit 4702c22aa3

View File

@@ -47,13 +47,10 @@ def launcher(proc: str, name: str, log_path: str) -> None:
raise raise
def join_process(process: Process, timeout: float) -> None: def join_process(process: Process, timeout: float) -> None:
# Process().join(timeout) will hang due to a python 3 bug: https://bugs.python.org/issue28382
# We have to poll the exitcode instead
t = time.monotonic() t = time.monotonic()
while time.monotonic() - t < timeout and process.exitcode is None: while time.monotonic() - t < timeout and process.exitcode is None:
time.sleep(0.001) time.sleep(0.001)
class ManagerProcess(ABC): class ManagerProcess(ABC):
daemon = False daemon = False
sigkill = False sigkill = False
@@ -61,7 +58,6 @@ class ManagerProcess(ABC):
proc: Process | None = None proc: Process | None = None
enabled = True enabled = True
name = "" name = ""
last_watchdog_time = 0 last_watchdog_time = 0
watchdog_max_dt: int | None = None watchdog_max_dt: int | None = None
watchdog_seen = False watchdog_seen = False
@@ -75,24 +71,22 @@ class ManagerProcess(ABC):
def start(self) -> None: def start(self) -> None:
pass pass
def restart(self) -> None: def restart(self) -> None:
self.stop(sig=signal.SIGKILL) if self.proc is not None and self.proc.exitcode is not None:
self.stop(sig=signal.SIGKILL, block=False)
self.start() self.start()
def check_watchdog(self, started: bool) -> None: def check_watchdog(self, started: bool) -> None:
if self.watchdog_max_dt is None or self.proc is None: if self.watchdog_max_dt is None or self.proc is None:
return return
try: try:
fn = WATCHDOG_FN + str(self.proc.pid) fn = WATCHDOG_FN + str(self.proc.pid)
with open(fn, "rb") as f: with open(fn, "rb") as f:
# TODO: why can't pylint find struct.unpack?
self.last_watchdog_time = struct.unpack('Q', f.read())[0] self.last_watchdog_time = struct.unpack('Q', f.read())[0]
except Exception: except Exception:
pass pass
dt = time.monotonic() - self.last_watchdog_time / 1e9 dt = time.monotonic() - self.last_watchdog_time / 1e9
if dt > self.watchdog_max_dt: if dt > self.watchdog_max_dt:
if (self.watchdog_seen or self.always_watchdog and self.proc.exitcode is not None) and ENABLE_WATCHDOG: if (self.watchdog_seen or self.always_watchdog and self.proc.exitcode is not None) and ENABLE_WATCHDOG:
cloudlog.error(f"Watchdog timeout for {self.name} (exitcode {self.proc.exitcode}) restarting ({started=})") cloudlog.error(f"Watchdog timeout for {self.name} (exitcode {self.proc.exitcode}) restarting ({started=})")
@@ -103,7 +97,6 @@ class ManagerProcess(ABC):
def stop(self, retry: bool = True, block: bool = True, sig: signal.Signals = None) -> int | None: def stop(self, retry: bool = True, block: bool = True, sig: signal.Signals = None) -> int | None:
if self.proc is None: if self.proc is None:
return None return None
if self.proc.exitcode is None: if self.proc.exitcode is None:
if not self.shutting_down: if not self.shutting_down:
cloudlog.info(f"killing {self.name}") cloudlog.info(f"killing {self.name}")
@@ -111,39 +104,23 @@ class ManagerProcess(ABC):
sig = signal.SIGKILL if self.sigkill else signal.SIGINT sig = signal.SIGKILL if self.sigkill else signal.SIGINT
self.signal(sig) self.signal(sig)
self.shutting_down = True self.shutting_down = True
if not block: if not block:
return None return None
join_process(self.proc, 5) join_process(self.proc, 5)
# If process failed to die send SIGKILL
if self.proc.exitcode is None and retry: if self.proc.exitcode is None and retry:
cloudlog.info(f"killing {self.name} with SIGKILL") cloudlog.info(f"killing {self.name} with SIGKILL")
self.signal(signal.SIGKILL) self.signal(signal.SIGKILL)
self.proc.join() self.proc.join()
ret = self.proc.exitcode ret = self.proc.exitcode
cloudlog.info(f"{self.name} is dead with {ret}") cloudlog.info(f"{self.name} is dead with {ret}")
if self.proc.exitcode is not None: if self.proc.exitcode is not None:
self.shutting_down = False self.shutting_down = False
self.proc = None self.proc = None
return ret return ret
def signal(self, sig: int) -> None: def signal(self, sig: int) -> None:
if self.proc is None: if self.proc is None or self.proc.exitcode is not None or self.proc.pid is None:
return return
# Don't signal if already exited
if self.proc.exitcode is not None and self.proc.pid is not None:
return
# Can't signal if we don't have a pid
if self.proc.pid is None:
return
cloudlog.info(f"sending signal {sig} to {self.name}") cloudlog.info(f"sending signal {sig} to {self.name}")
os.kill(self.proc.pid, sig) os.kill(self.proc.pid, sig)
@@ -243,6 +220,7 @@ class DaemonProcess(ManagerProcess):
def stop(self, retry=True, block=True, sig=None) -> None: def stop(self, retry=True, block=True, sig=None) -> None:
pass pass
def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None, CP: car.CarParams=None, not_run: list[str] | None=None, log_dir: str = None) -> list[ManagerProcess]: def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None, CP: car.CarParams=None, not_run: list[str] | None=None, log_dir: str = None) -> list[ManagerProcess]:
if not_run is None: if not_run is None:
not_run = [] not_run = []
@@ -251,11 +229,12 @@ def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None
for p in procs: for p in procs:
log_path = log_dir+"/"+p.name+".log" log_path = log_dir+"/"+p.name+".log"
if p.enabled and p.name not in not_run and p.should_run(started, params, CP): if p.enabled and p.name not in not_run and p.should_run(started, params, CP):
p.start(log_path) if p.proc is None or p.proc.exitcode is not None:
p.start(log_path)
running.append(p) running.append(p)
else: else:
p.stop(block=False) p.stop(block=False)
p.check_watchdog(started) p.check_watchdog(started)
return running return running