wip
This commit is contained in:
@@ -47,13 +47,10 @@ def launcher(proc: str, name: str, log_path: str) -> None:
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
def join_process(process: Process, timeout: float) -> None:
|
def join_process(process: Process, timeout: float) -> None:
|
||||||
# Process().join(timeout) will hang due to a python 3 bug: https://bugs.python.org/issue28382
|
|
||||||
# We have to poll the exitcode instead
|
|
||||||
t = time.monotonic()
|
t = time.monotonic()
|
||||||
while time.monotonic() - t < timeout and process.exitcode is None:
|
while time.monotonic() - t < timeout and process.exitcode is None:
|
||||||
time.sleep(0.001)
|
time.sleep(0.001)
|
||||||
|
|
||||||
|
|
||||||
class ManagerProcess(ABC):
|
class ManagerProcess(ABC):
|
||||||
daemon = False
|
daemon = False
|
||||||
sigkill = False
|
sigkill = False
|
||||||
@@ -61,7 +58,6 @@ class ManagerProcess(ABC):
|
|||||||
proc: Process | None = None
|
proc: Process | None = None
|
||||||
enabled = True
|
enabled = True
|
||||||
name = ""
|
name = ""
|
||||||
|
|
||||||
last_watchdog_time = 0
|
last_watchdog_time = 0
|
||||||
watchdog_max_dt: int | None = None
|
watchdog_max_dt: int | None = None
|
||||||
watchdog_seen = False
|
watchdog_seen = False
|
||||||
@@ -75,24 +71,22 @@ class ManagerProcess(ABC):
|
|||||||
def start(self) -> None:
|
def start(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def restart(self) -> None:
|
def restart(self) -> None:
|
||||||
self.stop(sig=signal.SIGKILL)
|
if self.proc is not None and self.proc.exitcode is not None:
|
||||||
|
self.stop(sig=signal.SIGKILL, block=False)
|
||||||
self.start()
|
self.start()
|
||||||
|
|
||||||
def check_watchdog(self, started: bool) -> None:
|
def check_watchdog(self, started: bool) -> None:
|
||||||
if self.watchdog_max_dt is None or self.proc is None:
|
if self.watchdog_max_dt is None or self.proc is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
fn = WATCHDOG_FN + str(self.proc.pid)
|
fn = WATCHDOG_FN + str(self.proc.pid)
|
||||||
with open(fn, "rb") as f:
|
with open(fn, "rb") as f:
|
||||||
# TODO: why can't pylint find struct.unpack?
|
|
||||||
self.last_watchdog_time = struct.unpack('Q', f.read())[0]
|
self.last_watchdog_time = struct.unpack('Q', f.read())[0]
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
dt = time.monotonic() - self.last_watchdog_time / 1e9
|
dt = time.monotonic() - self.last_watchdog_time / 1e9
|
||||||
|
|
||||||
if dt > self.watchdog_max_dt:
|
if dt > self.watchdog_max_dt:
|
||||||
if (self.watchdog_seen or self.always_watchdog and self.proc.exitcode is not None) and ENABLE_WATCHDOG:
|
if (self.watchdog_seen or self.always_watchdog and self.proc.exitcode is not None) and ENABLE_WATCHDOG:
|
||||||
cloudlog.error(f"Watchdog timeout for {self.name} (exitcode {self.proc.exitcode}) restarting ({started=})")
|
cloudlog.error(f"Watchdog timeout for {self.name} (exitcode {self.proc.exitcode}) restarting ({started=})")
|
||||||
@@ -103,7 +97,6 @@ class ManagerProcess(ABC):
|
|||||||
def stop(self, retry: bool = True, block: bool = True, sig: signal.Signals = None) -> int | None:
|
def stop(self, retry: bool = True, block: bool = True, sig: signal.Signals = None) -> int | None:
|
||||||
if self.proc is None:
|
if self.proc is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if self.proc.exitcode is None:
|
if self.proc.exitcode is None:
|
||||||
if not self.shutting_down:
|
if not self.shutting_down:
|
||||||
cloudlog.info(f"killing {self.name}")
|
cloudlog.info(f"killing {self.name}")
|
||||||
@@ -111,39 +104,23 @@ class ManagerProcess(ABC):
|
|||||||
sig = signal.SIGKILL if self.sigkill else signal.SIGINT
|
sig = signal.SIGKILL if self.sigkill else signal.SIGINT
|
||||||
self.signal(sig)
|
self.signal(sig)
|
||||||
self.shutting_down = True
|
self.shutting_down = True
|
||||||
|
|
||||||
if not block:
|
if not block:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
join_process(self.proc, 5)
|
join_process(self.proc, 5)
|
||||||
|
|
||||||
# If process failed to die send SIGKILL
|
|
||||||
if self.proc.exitcode is None and retry:
|
if self.proc.exitcode is None and retry:
|
||||||
cloudlog.info(f"killing {self.name} with SIGKILL")
|
cloudlog.info(f"killing {self.name} with SIGKILL")
|
||||||
self.signal(signal.SIGKILL)
|
self.signal(signal.SIGKILL)
|
||||||
self.proc.join()
|
self.proc.join()
|
||||||
|
|
||||||
ret = self.proc.exitcode
|
ret = self.proc.exitcode
|
||||||
cloudlog.info(f"{self.name} is dead with {ret}")
|
cloudlog.info(f"{self.name} is dead with {ret}")
|
||||||
|
|
||||||
if self.proc.exitcode is not None:
|
if self.proc.exitcode is not None:
|
||||||
self.shutting_down = False
|
self.shutting_down = False
|
||||||
self.proc = None
|
self.proc = None
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def signal(self, sig: int) -> None:
|
def signal(self, sig: int) -> None:
|
||||||
if self.proc is None:
|
if self.proc is None or self.proc.exitcode is not None or self.proc.pid is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Don't signal if already exited
|
|
||||||
if self.proc.exitcode is not None and self.proc.pid is not None:
|
|
||||||
return
|
|
||||||
|
|
||||||
# Can't signal if we don't have a pid
|
|
||||||
if self.proc.pid is None:
|
|
||||||
return
|
|
||||||
|
|
||||||
cloudlog.info(f"sending signal {sig} to {self.name}")
|
cloudlog.info(f"sending signal {sig} to {self.name}")
|
||||||
os.kill(self.proc.pid, sig)
|
os.kill(self.proc.pid, sig)
|
||||||
|
|
||||||
@@ -243,6 +220,7 @@ class DaemonProcess(ManagerProcess):
|
|||||||
def stop(self, retry=True, block=True, sig=None) -> None:
|
def stop(self, retry=True, block=True, sig=None) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None, CP: car.CarParams=None, not_run: list[str] | None=None, log_dir: str = None) -> list[ManagerProcess]:
|
def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None, CP: car.CarParams=None, not_run: list[str] | None=None, log_dir: str = None) -> list[ManagerProcess]:
|
||||||
if not_run is None:
|
if not_run is None:
|
||||||
not_run = []
|
not_run = []
|
||||||
@@ -251,6 +229,7 @@ def ensure_running(procs: ValuesView[ManagerProcess], started: bool, params=None
|
|||||||
for p in procs:
|
for p in procs:
|
||||||
log_path = log_dir+"/"+p.name+".log"
|
log_path = log_dir+"/"+p.name+".log"
|
||||||
if p.enabled and p.name not in not_run and p.should_run(started, params, CP):
|
if p.enabled and p.name not in not_run and p.should_run(started, params, CP):
|
||||||
|
if p.proc is None or p.proc.exitcode is not None:
|
||||||
p.start(log_path)
|
p.start(log_path)
|
||||||
running.append(p)
|
running.append(p)
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user