Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ Every variable below is consumed by the container entrypoint, so it applies iden
|---|---|---|
| `WINDROSE_WEBHOOK_URL` | `` | Generic JSON `POST` target. |
| `WINDROSE_DISCORD_WEBHOOK_URL` | `` | Discord embed target. |
| `WINDROSE_WEBHOOK_EVENTS` | `server.online,server.offline,player.join,player.leave` | Comma-separated subset. Additional events: `backup.created`, `backup.restored`, `config.applied`. |
| `WINDROSE_WEBHOOK_EVENTS` | `server.online,server.offline,server.crashed,player.join,player.leave,backup.failed,backup.restore.failed,config.apply.failed` | Comma-separated subset. Additional opt-in success events: `backup.created`, `backup.restored`, `config.applied`. |
| `WINDROSE_WEBHOOK_POLL_SECONDS` | `15` | Poll cadence for the event detector thread. |
| `WINDROSE_WEBHOOK_TIMEOUT` | `5` | HTTP POST timeout (seconds). |

Expand Down Expand Up @@ -467,15 +467,19 @@ The admin console's UI container runs a small event detector in a background thr

Event types:

| Event | Fires when |
| ----------------- | ------------------------------------------------------------------ |
| `server.online` | The game process appears (post-restart or first boot). |
| `server.offline` | The game process goes away (crash, `stop`, pod eviction). |
| `player.join` | A new `AccountId` appears in the connected-players snapshot. |
| `player.leave` | An `AccountId` drops out of the connected-players snapshot. |
| `backup.created` | The admin console's **Create backup now**, `POST /api/backups`, or an auto-backup from the scheduler (payload includes `source: "auto"` + `reason: "idle"` or `"floor"`). |
| `backup.restored` | A backup is restored via `POST /api/backups/{id}/restore` or a game auto-backup is merged via `POST /api/game-backups/{ts}/restore`. |
| `config.applied` | Config changes are applied via **Apply + restart**. |
| Event | Fires when |
| ------------------------ | ------------------------------------------------------------------ |
| `server.online` | The game process appears (post-restart or first boot). |
| `server.offline` | The game process goes away within ~30 s of an operator-initiated `/api/server/stop` or `/api/server/restart` (clean shutdown). |
| `server.crashed` | The game process goes away with no recent operator stop/restart — e.g. an unexpected exit, OOM kill, or pod eviction. |
| `player.join` | A new `AccountId` appears in the connected-players snapshot. |
| `player.leave` | An `AccountId` drops out of the connected-players snapshot. |
| `backup.created` | The admin console's **Create backup now**, `POST /api/backups`, `POST /api/backups/upload`, or an auto-backup from the scheduler (payload includes `source: "auto"` + `reason: "idle"` or `"floor"`). |
| `backup.failed` | An auto, manual, or imported backup raised an exception. Payload includes `source` (`auto` / `manual` / `imported`) and `reason`. |
| `backup.restored` | A backup is restored via `POST /api/backups/{id}/restore` or a game auto-backup is merged via `POST /api/game-backups/{ts}/restore`. |
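| `backup.restore.failed` | A restore raised an exception, including when the requested backup does not exist. Payload includes `backupId` (prefixed with `game:` for game auto-backups) and `reason`. |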
| `config.applied` | Config changes are applied via **Apply + restart**. |
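| `config.apply.failed` | **Apply + restart** failed partway through, either while stopping the game before a mod change or while applying the mods. Payload includes `stage` (`stop-for-mods` or `apply-mods`) and `reason`. |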

Restrict the fired set with `WINDROSE_WEBHOOK_EVENTS` (comma-separated). Empty URLs disable delivery entirely — leave both URLs unset to suppress webhooks even if the event list is populated.

Expand Down
74 changes: 63 additions & 11 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,20 @@
WEBHOOK_URL = os.environ.get("WINDROSE_WEBHOOK_URL", "").strip()
WEBHOOK_DISCORD_URL = os.environ.get("WINDROSE_DISCORD_WEBHOOK_URL", "").strip()
WEBHOOK_EVENTS_RAW = os.environ.get("WINDROSE_WEBHOOK_EVENTS",
"server.online,server.offline,player.join,player.leave").strip()
"server.online,server.offline,server.crashed,"
"player.join,player.leave,"
"backup.failed,backup.restore.failed,config.apply.failed").strip()
WEBHOOK_TIMEOUT = float(os.environ.get("WINDROSE_WEBHOOK_TIMEOUT", "5"))
WEBHOOK_POLL_SECONDS = float(os.environ.get("WINDROSE_WEBHOOK_POLL_SECONDS", "15"))
WEBHOOK_EVENTS = {e.strip() for e in WEBHOOK_EVENTS_RAW.split(",") if e.strip()}

# Set by /api/server/{stop,restart} so EventDetector can distinguish an
# operator-initiated stop (fires server.offline) from an unexpected exit
# (fires server.crashed). Plain float to keep this lock-free; the worst
# race is one missed crash-classification across two consecutive 15s polls.
#
# NOTE(review): the grace window below is fixed, while the detector's poll
# interval is configurable (WINDROSE_WEBHOOK_POLL_SECONDS). If the poll
# interval, or the time the game takes to exit after a stop request, exceeds
# the grace window, an operator stop looks like it would be reported as
# server.crashed — confirm whether that is acceptable.

# Wall-clock time (time.time(), seconds) of the most recent operator
# stop/restart request. The initial 0.0 means any exit seen before the first
# recorded request falls outside the grace window and is reported as a crash.
_LAST_OPERATOR_STOP_AT: float = 0.0
# Seconds after an operator stop/restart during which a disappearing game
# process is still reported as server.offline rather than server.crashed.
_OPERATOR_STOP_GRACE_SECONDS: float = 30.0

# --- Utility: resource quantity parsing -------------------------------------
def parse_cpu_to_mcpu(q: str) -> int:
if not q:
Expand Down Expand Up @@ -983,6 +992,7 @@ def trigger_auto_backup(reason: str) -> dict | None:
print(f"[auto-backup] {msg}", file=sys.stderr, flush=True)
with _auto_state_lock:
_auto_state["lastResult"] = msg
fire_event("backup.failed", source="auto", reason=str(e), trigger=reason)
return None
bkp_dir = Path(bkp["path"])
_mark_auto_backup(bkp_dir)
Expand Down Expand Up @@ -2016,13 +2026,17 @@ def handle_upload(body_stream, content_length: int, filename: str) -> dict:
# gets a raw JSON body.

_WEBHOOK_COLORS = {
"server.online": 0x2ecc71, # green
"server.offline": 0xe74c3c, # red
"player.join": 0x3498db, # blue
"player.leave": 0x95a5a6, # grey
"config.applied": 0xf1c40f, # yellow
"backup.created": 0x9b59b6, # purple
"backup.restored": 0x8e44ad,
"server.online": 0x2ecc71, # green
"server.offline": 0xe74c3c, # red
"server.crashed": 0xc0392b, # darker red
"player.join": 0x3498db, # blue
"player.leave": 0x95a5a6, # grey
"config.applied": 0xf1c40f, # yellow
"config.apply.failed": 0xc0392b, # darker red
"backup.created": 0x9b59b6, # purple
"backup.restored": 0x8e44ad,
"backup.failed": 0xc0392b, # darker red
"backup.restore.failed": 0xc0392b, # darker red
}

def redact_url(url: str) -> str:
Expand Down Expand Up @@ -2066,7 +2080,9 @@ def build_discord_payload(event: dict) -> dict:
lines.append(f"**Invite:** `{event.get('inviteCode','?')}`")
lines.append(f"**Region:** {event.get('backendRegion','?') or '?'}")
elif name == "server.offline":
lines.append("Server went down (game process no longer visible).")
lines.append("Server stopped (operator-initiated within the grace window).")
elif name == "server.crashed":
lines.append("Server went down unexpectedly (no recent stop/restart from the admin UI).")
elif name == "player.join":
lines.append(f"**{event.get('name','?')}** joined")
lines.append(f"Players: {event.get('playerCount',0)} / {event.get('maxPlayerCount','?')}")
Expand All @@ -2075,8 +2091,17 @@ def build_discord_payload(event: dict) -> dict:
lines.append(f"Players: {event.get('playerCount',0)} / {event.get('maxPlayerCount','?')}")
elif name == "config.applied":
lines.append("Staged config applied — server restarting.")
elif name == "config.apply.failed":
lines.append(f"**Stage:** {event.get('stage','?')}")
lines.append(f"**Reason:** {event.get('reason','?')}")
elif name in ("backup.created", "backup.restored"):
lines.append(f"Backup: `{event.get('backupId','?')}`")
elif name == "backup.failed":
lines.append(f"**Source:** {event.get('source','?')}")
lines.append(f"**Reason:** {event.get('reason','?')}")
elif name == "backup.restore.failed":
lines.append(f"**Backup:** `{event.get('backupId','?')}`")
lines.append(f"**Reason:** {event.get('reason','?')}")
footer = f"{event.get('serverName','Windrose')} · {event.get('timestamp','')}"
return {
"embeds": [{
Expand Down Expand Up @@ -2157,7 +2182,17 @@ def run(self) -> None:
if self._prev_online is False and online:
fire_event("server.online", **common)
elif self._prev_online is True and not online:
fire_event("server.offline", **common)
# Distinguish operator-initiated stop from unexpected
# exit: if /api/server/{stop,restart} fired within the
# last _OPERATOR_STOP_GRACE_SECONDS, treat as a clean
# offline; otherwise the game went away on its own and
# we report a crash. Both events get the same payload
# shape so subscribers can treat them interchangeably.
elapsed = time.time() - _LAST_OPERATOR_STOP_AT
if elapsed < _OPERATOR_STOP_GRACE_SECONDS:
fire_event("server.offline", **common)
else:
fire_event("server.crashed", **common)
joined = set(players) - set(self._prev_players)
left = set(self._prev_players) - set(players)
for aid in joined:
Expand Down Expand Up @@ -2523,6 +2558,8 @@ def _api_server_stop(self):
# auto-restart. On k8s/compose where systemctl isn't reachable,
# fall through to the SIGTERM path (kubelet / compose restart
# policies will typically bring the game back).
global _LAST_OPERATOR_STOP_AT
_LAST_OPERATOR_STOP_AT = time.time()
if _systemctl_available():
ok, msg = systemctl_dispatch("stop")
if ok:
Expand Down Expand Up @@ -2646,6 +2683,7 @@ def _api_config_apply(self):
if resume_after_mod_apply:
set_maintenance_flag(False)
request_restart_later()
fire_event("config.apply.failed", stage="stop-for-mods", reason=msg)
self._send(HTTPStatus.CONFLICT, "text/plain",
f"game did not stop before mod apply: {msg}; staged changes left pending\n".encode())
return
Expand All @@ -2656,6 +2694,7 @@ def _api_config_apply(self):
if resume_after_mod_apply:
set_maintenance_flag(False)
request_restart_later()
fire_event("config.apply.failed", stage="apply-mods", reason=str(e))
self._send(HTTPStatus.BAD_REQUEST, "text/plain", f"mod apply failed: {e}\n".encode())
return
if have_server:
Expand Down Expand Up @@ -2707,6 +2746,8 @@ def _api_server_restart(self):
restart-on-exit otherwise."""
if not allow_destructive():
self._forbidden(); return
global _LAST_OPERATOR_STOP_AT
_LAST_OPERATOR_STOP_AT = time.time()
if _systemctl_available():
ok, msg = systemctl_dispatch("restart")
if ok:
Expand Down Expand Up @@ -2826,7 +2867,12 @@ def _api_backups_create(self):
pass # treat as unpinned; don't fail just because body is junk
if "pin=1" in (self.headers.get("X-Query-String", "") or ""):
pin = True
bkp = create_backup(pin=pin)
try:
bkp = create_backup(pin=pin)
except Exception as e: # noqa: BLE001 — surface to operator + webhook
fire_event("backup.failed", source="manual", reason=str(e), pinned=pin)
self._send(HTTPStatus.INTERNAL_SERVER_ERROR, "text/plain", f"{e}\n".encode())
return
fire_event("backup.created", backupId=bkp.get("id", ""))
self._json(HTTPStatus.OK, bkp)

Expand Down Expand Up @@ -2889,8 +2935,10 @@ def _api_backups_restore(self, bid: str):
try:
restore_backup(bid)
except FileNotFoundError:
fire_event("backup.restore.failed", backupId=bid, reason="no such backup")
self._send(HTTPStatus.NOT_FOUND, "text/plain", b"no such backup\n"); return
except Exception as e:
fire_event("backup.restore.failed", backupId=bid, reason=str(e))
self._send(HTTPStatus.INTERNAL_SERVER_ERROR, "text/plain", f"{e}\n".encode()); return
request_restart()
fire_event("backup.restored", backupId=bid)
Expand Down Expand Up @@ -2930,8 +2978,10 @@ def _api_backups_upload(self):
try:
bkp = import_backup_archive(self.rfile, length, filename)
except ValueError as e:
fire_event("backup.failed", source="imported", reason=str(e), filename=filename)
self._send(HTTPStatus.BAD_REQUEST, "text/plain", f"{e}\n".encode()); return
except Exception as e: # noqa: BLE001
fire_event("backup.failed", source="imported", reason=str(e), filename=filename)
self._send(HTTPStatus.INTERNAL_SERVER_ERROR, "text/plain", f"{e}\n".encode()); return
fire_event("backup.created", backupId=bkp["id"], source="imported")
self._json(HTTPStatus.OK, bkp)
Expand Down Expand Up @@ -2977,8 +3027,10 @@ def _api_game_backups_restore(self, ts: str):
try:
restore_game_backup(ts)
except FileNotFoundError:
fire_event("backup.restore.failed", backupId=f"game:{ts}", reason="no such game backup")
self._send(HTTPStatus.NOT_FOUND, "text/plain", b"no such game backup\n"); return
except Exception as e:
fire_event("backup.restore.failed", backupId=f"game:{ts}", reason=str(e))
self._send(HTTPStatus.INTERNAL_SERVER_ERROR, "text/plain", f"{e}\n".encode()); return
request_restart()
fire_event("backup.restored", backupId=f"game:{ts}")
Expand Down
Loading