Clean up all worker stats when starting up servers

This commit is contained in:
2026-04-04 21:28:26 +02:00
parent 79f23b8be6
commit 9d33c6fded
6 changed files with 109 additions and 10 deletions
+6
View File
@@ -8,5 +8,11 @@ class MemoryMetricsStore:
async def load_all(self) -> list[dict]:
    """Return a list holding a shallow ``dict`` copy of every stored snapshot."""
    copies: list[dict] = []
    for snapshot in self._snapshots.values():
        # Copy so callers cannot mutate the stored mapping through the result.
        copies.append(dict(snapshot))
    return copies
async def clear_all(self) -> None:
    """Remove every stored snapshot, emptying the mapping in place."""
    snapshots = self._snapshots
    snapshots.clear()
async def acquire_startup_cleanup_lock(
    self,
    lock_key: str,
    ttl_seconds: int = 300,
) -> bool:
    """Report that the startup cleanup lock was acquired.

    This store performs no locking: both arguments are ignored and the
    result is always ``True``.
    """
    return True
async def close(self) -> None:
    """Do nothing; this store has nothing to release on close."""
    return
+28 -5
View File
@@ -1,10 +1,12 @@
from server.metrics import MetricsStoreBuilder
import time, os
from typing import Any, Awaitable, cast
import time, os, inspect
class MetricsManager:
def __init__(self, backend:str="memory", redis_url:str="redis://localhost:6379/0", ttl_seconds:int=90, key_prefix:str="snake:metrics:worker", worker_id:str|None=None):
def __init__(self, backend:str="memory", redis_url:str="redis://localhost:6379/0", ttl_seconds:int|None=90, key_prefix:str="snake:metrics:worker", worker_id:str|None=None):
self.backend = (backend or "memory").strip().lower()
self.key_prefix = key_prefix
self.worker_id = worker_id or f"{os.getpid()}-{int(time.time() * 1000)}"
self.store = MetricsStoreBuilder.build(
backend=self.backend,
@@ -30,6 +32,27 @@ class MetricsManager:
async def close(self) -> None:
    """Close the underlying metrics store by awaiting its ``close()``."""
    store = self.store
    await store.close()
async def clear_all_workers(self) -> None:
    """Ask the backing store to drop all worker snapshots, if it can.

    Stores without a callable ``clear_all`` attribute are left untouched.
    The call result is awaited only when it is awaitable, so both sync and
    async store implementations are accepted.
    """
    clearer = getattr(self.store, "clear_all", None)
    if not callable(clearer):
        return
    outcome = clearer()
    if not inspect.isawaitable(outcome):
        return
    await cast(Awaitable[Any], outcome)
async def acquire_startup_cleanup_lock(self, ttl_seconds:int=300) -> bool:
    """Try to become the one worker allowed to run the startup cleanup.

    Any backend other than ``"redis"``, and any store without a callable
    ``acquire_startup_cleanup_lock``, always grants permission.

    Args:
        ttl_seconds: Lock lifetime forwarded to the store.

    Returns:
        ``True`` when this worker may perform the cleanup, ``False`` when
        the store reports the lock is already held.
    """
    if self.backend != "redis":
        return True
    acquire_lock = getattr(self.store, "acquire_startup_cleanup_lock", None)
    if not callable(acquire_lock):
        return True
    # The lock key must NOT live under "<key_prefix>:". The Redis store's
    # clear_all() deletes every key matching "<key_prefix>:*", so a lock stored
    # there is erased by the very cleanup it guards. Every later-starting worker
    # would then re-acquire it and wipe the metrics of workers already running.
    lock_key = f"{self.key_prefix}_startup_cleanup_lock"
    maybe_result = acquire_lock(lock_key, ttl_seconds)
    if inspect.isawaitable(maybe_result):
        return bool(await cast(Awaitable[Any], maybe_result))
    return bool(maybe_result)
def _merge_snapshots(self, snapshots:list[dict]) -> dict:
merged = {
"games_started": 0,
@@ -86,11 +109,11 @@ class MetricsManager:
merged["max_turn"] = max(merged["max_turn"], int(worker.get("max_turn", 0)))
merged["active_games_peak"] = max(merged["active_games_peak"], int(worker.get("active_games_peak", 0)))
merged["move_response_time_ms_max"] = max(merged["move_response_time_ms_max"], float(worker.get("move_response_time_ms_max", 0.0)))
merged["last_game_start_unix"] = max(merged["last_game_start_unix"], int(worker.get("last_game_start_unix", 0)),)
merged["last_game_start_unix"] = max(merged["last_game_start_unix"], int(worker.get("last_game_start_unix", 0)))
merged["last_game_end_unix"] = max(merged["last_game_end_unix"], int(worker.get("last_game_end_unix", 0)))
merged["last_move_unix"] = max(merged["last_move_unix"], int(worker.get("last_move_unix", 0)))
merged["oldest_active_game_age_sec"] = max(merged["oldest_active_game_age_sec"], int(worker.get("oldest_active_game_age_sec", 0)),)
merged["stale_game_timeout_sec"] = max(merged["stale_game_timeout_sec"], int(worker.get("stale_game_timeout_sec", 0)),)
merged["oldest_active_game_age_sec"] = max(merged["oldest_active_game_age_sec"], int(worker.get("oldest_active_game_age_sec", 0)))
merged["stale_game_timeout_sec"] = max(merged["stale_game_timeout_sec"], int(worker.get("stale_game_timeout_sec", 0)))
merged["game_state_local_cache_enabled"] = merged["game_state_local_cache_enabled"] or bool(worker.get("game_state_local_cache_enabled", False))
for endpoint in merged["http_requests_by_endpoint"]:
+12 -1
View File
@@ -2,7 +2,7 @@ import inspect
import json
class RedisMetricsStore:
def __init__(self, redis_url:str="redis://localhost:6379/0", key_prefix:str="snake:metrics:worker", ttl_seconds:int=None, **kwargs):
def __init__(self, redis_url:str="redis://localhost:6379/0", key_prefix:str="snake:metrics:worker", ttl_seconds:int|None=None, **kwargs):
self.redis_url = redis_url
self.key_prefix = key_prefix
self.ttl_seconds = ttl_seconds
@@ -41,6 +41,17 @@ class RedisMetricsStore:
continue
return snapshots
async def clear_all(self) -> None:
    """Delete every key stored under ``"<key_prefix>:"``.

    Keys are discovered with incremental ``SCAN`` iteration rather than
    ``KEYS``: ``KEYS`` walks the whole keyspace in a single blocking
    command, which stalls every other client on a large database.
    Deletions are issued in bounded batches so no single ``DEL`` carries an
    unbounded argument list.
    """
    batch_size = 500
    redis = await self._get_redis()
    pending: list = []
    async for key in redis.scan_iter(match=f"{self.key_prefix}:*", count=batch_size):
        pending.append(key)
        if len(pending) >= batch_size:
            await redis.delete(*pending)
            pending.clear()
    # Flush the final partial batch; skipped when nothing matched, because
    # DEL requires at least one key.
    if pending:
        await redis.delete(*pending)
async def acquire_startup_cleanup_lock(
    self,
    lock_key: str,
    ttl_seconds: int = 300,
) -> bool:
    """Try to create ``lock_key`` in Redis as a set-if-absent key with an expiry.

    Args:
        lock_key: Redis key used as the lock.
        ttl_seconds: Requested expiry; coerced to ``int`` and raised to at
            least one second.

    Returns:
        The truthiness of the reply from the ``SET`` call.
    """
    client = await self._get_redis()
    expiry = max(1, int(ttl_seconds))
    was_set = await client.set(lock_key, '1', ex=expiry, nx=True)
    return bool(was_set)
async def close(self) -> None:
if self._redis is None:
return
+9 -3
View File
@@ -1,9 +1,9 @@
import time
from server.metrics.MetricsManager import MetricsManager
import time
class ServerMetricsCollector:
def __init__(self, metrics_manager:MetricsManager, game_state_local_cache:bool, metrics_backend:str, game_state_backend:str, stale_game_timeout_sec:int, game_last_seen_unix:dict, game_move_counts:dict,):
def __init__(self, metrics_manager:MetricsManager, game_state_local_cache:bool, metrics_backend:str, game_state_backend:str, stale_game_timeout_sec:int, game_last_seen_unix:dict, game_move_counts:dict):
self._manager = metrics_manager
self._stale_game_timeout_sec = stale_game_timeout_sec
self._game_last_seen_unix = game_last_seen_unix
@@ -167,6 +167,12 @@ class ServerMetricsCollector:
local_snapshot = self.build_local_snapshot(game_last_seen_unix, game_move_counts)
return await self._manager.snapshot(local_snapshot)
async def clear_worker_metrics(self) -> None:
    """Clear all worker metrics by delegating to the manager's ``clear_all_workers()``."""
    manager = self._manager
    await manager.clear_all_workers()
async def should_clear_worker_metrics_on_startup(
    self,
    lock_ttl_seconds: int = 300,
) -> bool:
    """Return the manager's answer to ``acquire_startup_cleanup_lock``.

    Args:
        lock_ttl_seconds: Lock lifetime forwarded to the manager.
    """
    acquired = await self._manager.acquire_startup_cleanup_lock(lock_ttl_seconds)
    return acquired
def build_prometheus_metrics(self, snapshot:dict) -> str:
lines = [
'# HELP snake_games_started_total Total games started by snake server.',