# Source code for exlab_wizard.tray.quit_coordinator

"""Graceful tray shutdown coordinator. Backend Spec §4.3.2.

Steps documented in §4.3.2 (canonical):

1. Send the FastAPI lifespan shutdown signal; the server stops accepting
   new requests and ``POST /api/v1/sessions`` returns 503 with
   ``error.code: "shutting_down"``.
2. Wait up to **30 seconds** (5 seconds for SIGTERM at logoff, since the
   OS will hard-kill the process anyway) for the predicate
   ``SessionStore.active_sessions == 0 AND
   NASSyncClient.in_flight_jobs == 0``.
3. If the predicate becomes true within the window, exit cleanly.
4. If the timeout expires, prompt the operator with "1 operation still
   running. Force quit anyway?" via the open window if alive, otherwise
   via an OS notification. Operator picks **Force quit** (immediate
   shutdown; durable NAS-sync queue resumes on next launch) or **Wait**
   (resets the 30-second timer).

The coordinator is async because the predicate poll uses
``asyncio.sleep`` and the shutdown handoff to ``ServerRunner.stop``
naturally fits an async cleanup point. Tests can pass a 0-second timeout
to exercise the timeout branch deterministically.
"""

from __future__ import annotations

import asyncio
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

from exlab_wizard.logging import get_logger

if TYPE_CHECKING:
    from exlab_wizard.tray.server_runner import ServerRunner
    from exlab_wizard.tray.window_launcher import WindowLauncher

# Only the coordinator class is part of the module's public surface.
__all__ = ["QuitCoordinator"]

# Module-level logger obtained from the project's logging helper.
_log = get_logger(__name__)


# Backend §4.3.2: 30-second normal timeout, 5-second SIGTERM timeout.
# Exposed as module-level constants so tests can monkeypatch.
# NOTE(review): ``QuitCoordinator.__init__`` uses these names as parameter
# defaults, and Python evaluates defaults once at definition time -- patching
# the constants after import does not change the constructor defaults; pass
# the timeouts explicitly if a test needs different values.
# Wait window (seconds) used by ``QuitCoordinator.quit()`` when ``sigterm`` is False.
DEFAULT_TIMEOUT_SECONDS: float = 30.0
# Shorter wait window (seconds) used when ``quit(sigterm=True)`` is requested.
SIGTERM_TIMEOUT_SECONDS: float = 5.0
# Sleep (seconds) between successive evaluations of the idle predicate.
PREDICATE_POLL_INTERVAL_SECONDS: float = 0.1


class QuitCoordinator:
    """Drive the §4.3.2 graceful-shutdown protocol.

    Construction-time dependencies are kept loosely typed so the
    coordinator can integrate with whatever stub fixtures unit tests pass.

    All constructor arguments are keyword-only.

    Args:
        server_runner: Object whose ``stop()`` method is called as the last
            teardown step of :meth:`quit`.
        window_launcher: Object whose ``close()`` method is called before the
            server is stopped, or ``None`` when there is no window to close.
        session_store: Object exposing ``active_sessions`` (attribute or
            zero-argument callable). ``None`` is treated as "no sessions".
        nas_sync: Object exposing ``in_flight_jobs`` (attribute or
            zero-argument callable). ``None`` is treated as "no jobs".
        on_force_quit_prompt: Zero-argument callable returning a truthy value
            for "force quit" and a falsy value for "wait". When omitted, the
            coordinator behaves as if the operator always chose force quit.
        timeout_seconds: Wait window used by a normal quit.
        sigterm_timeout_seconds: Wait window used by ``quit(sigterm=True)``.
        poll_interval_seconds: Sleep between idle-predicate evaluations.
    """

    def __init__(
        self,
        *,
        server_runner: ServerRunner,
        window_launcher: WindowLauncher | None,
        session_store: Any,
        nas_sync: Any,
        on_force_quit_prompt: Callable[[], bool] | None = None,
        timeout_seconds: float = DEFAULT_TIMEOUT_SECONDS,
        sigterm_timeout_seconds: float = SIGTERM_TIMEOUT_SECONDS,
        poll_interval_seconds: float = PREDICATE_POLL_INTERVAL_SECONDS,
    ) -> None:
        self._server_runner = server_runner
        self._window_launcher = window_launcher
        self._session_store = session_store
        self._nas_sync = nas_sync
        # Without a UI-bound prompt there is nobody to ask, so default to
        # "force quit" rather than waiting forever.
        self._on_force_quit_prompt = on_force_quit_prompt or (lambda: True)
        # Coerce to float so ints (e.g. ``0`` in tests) behave uniformly.
        self._timeout = float(timeout_seconds)
        self._sigterm_timeout = float(sigterm_timeout_seconds)
        self._poll = float(poll_interval_seconds)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def quit(self, *, sigterm: bool = False) -> None:
        """Run the graceful-shutdown protocol.

        ``sigterm=True`` reduces the wait window to ``sigterm_timeout``
        (5 s by default) because the OS will hard-kill the process shortly
        anyway.

        The flow is: wait for the idle predicate; on timeout, ask the
        operator; if they choose Wait, grant exactly one more wait window
        and then proceed regardless. Finally the window (if any) is closed
        and the server is stopped.

        Raises:
            Exception: Whatever ``window_launcher.close()`` or
                ``server_runner.stop()`` raise is propagated. The server is
                stopped even when closing the window fails.
        """
        timeout = self._sigterm_timeout if sigterm else self._timeout
        _log.info(
            "graceful shutdown initiated (sigterm=%s, timeout=%.1f)", sigterm, timeout
        )

        idle = await self._wait_for_idle(timeout)
        if not idle:
            _log.warning("graceful shutdown timed out; prompting for force-quit")
            if not self._prompt_force_quit():
                _log.info("operator chose Wait; resetting timer")
                # Fresh wait window; one extra retry per §4.3.2.
                idle = await self._wait_for_idle(timeout)
                if not idle:
                    _log.warning("still not idle after Wait; force-quitting anyway")

        # Tear down the live components in reverse-spawn order. The
        # ``finally`` guarantees the server is stopped even if closing the
        # window raises; otherwise a UI failure would leave the server alive.
        try:
            if self._window_launcher is not None:
                self._window_launcher.close()
        finally:
            self._server_runner.stop()
        _log.info("shutdown complete")

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _is_idle(self) -> bool:
        """Return True iff the §4.3.2 predicate holds.

        ``SessionStore.active_sessions == 0 AND
        NASSyncClient.in_flight_jobs == 0``. Both attributes are read
        defensively because the production wiring may pass loosely typed
        stubs (e.g. during setup-incomplete state where the session store
        is None).
        """
        active = _safe_count(self._session_store, "active_sessions")
        in_flight = _safe_count(self._nas_sync, "in_flight_jobs")
        return active == 0 and in_flight == 0

    async def _wait_for_idle(self, deadline_seconds: float) -> bool:
        """Poll :meth:`_is_idle` up to ``deadline_seconds`` seconds.

        Returns True if the predicate became true within the window.
        Sleeps in fixed intervals (``self._poll``) so a short deadline
        still produces a deterministic number of poll iterations in tests.
        Named ``deadline_seconds`` (not ``timeout``) to make clear that we
        are polling, not setting an asyncio cancellation timeout.

        A non-positive ``deadline_seconds`` evaluates the predicate once
        and returns immediately without sleeping.
        """
        if self._is_idle():
            return True
        if deadline_seconds <= 0:
            return False

        # We are inside a coroutine, so a running loop is guaranteed; fetch
        # it once instead of looking it up on every iteration.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + deadline_seconds
        while True:
            await asyncio.sleep(self._poll)
            # Check the predicate before the clock so a job that finishes
            # during the final sleep still counts as a clean exit.
            if self._is_idle():
                return True
            if loop.time() >= deadline:
                return False

    def _prompt_force_quit(self) -> bool:
        """Invoke the operator-facing force-quit prompt.

        Returns ``True`` to indicate "force quit anyway", ``False`` for
        "wait". Backend §4.3.2: prompt routed through the open window if
        alive, otherwise via OS notification. The launcher passes the
        actual UI-bound prompt at construction time; the coordinator is
        agnostic to which path it takes.

        If the prompt callable raises, the failure is logged and treated as
        "force quit" so a broken UI cannot block shutdown.
        """
        try:
            return bool(self._on_force_quit_prompt())
        except Exception:
            _log.exception("force-quit prompt raised; defaulting to force-quit")
            return True
def _safe_count(obj: Any, attr: str) -> int: """Return ``int(obj.<attr>)`` defensively. Falls through ``None``, missing attributes, callable attributes (some implementations expose the count via a method), and non-numeric returns -- the worst-case behavior is a count of 0, which is the "idle" interpretation. The launcher logs a warning once at startup if the wiring is not as expected; the coordinator itself does not gate shutdown on diagnostics. """ if obj is None: return 0 raw = getattr(obj, attr, 0) if callable(raw): try: raw = raw() except Exception: return 0 try: return int(raw) except (TypeError, ValueError): return 0