From c6ebb5834bc397e53d2be5466e3eb8c3f8dd6863 Mon Sep 17 00:00:00 2001 From: Daniel Dolezal Date: Sat, 4 Apr 2026 10:23:12 +0200 Subject: [PATCH] auto remove stuck games --- grafana/snake-metrics-dashboard.json | 309 ++++++++++++++++++++++++++- server/Server.py | 70 +++++- 2 files changed, 367 insertions(+), 12 deletions(-) diff --git a/grafana/snake-metrics-dashboard.json b/grafana/snake-metrics-dashboard.json index 2c8fe15..273b8d1 100644 --- a/grafana/snake-metrics-dashboard.json +++ b/grafana/snake-metrics-dashboard.json @@ -473,6 +473,14 @@ { "color": "green", "value": null + }, + { + "color": "yellow", + "value": 120 + }, + { + "color": "red", + "value": 250 } ] }, @@ -488,7 +496,7 @@ }, "id": 7, "options": { - "colorMode": "value", + "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", @@ -531,6 +539,10 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 1 } ] } @@ -545,7 +557,7 @@ }, "id": 8, "options": { - "colorMode": "value", + "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", @@ -825,7 +837,42 @@ ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "avg move ms" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "orange" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "max move ms" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + }, + { + "id": "custom.lineWidth", + "value": 3 + } + ] + } + ] }, "gridPos": { "h": 8, @@ -872,6 +919,260 @@ ], "title": "Move Directions + Move Latency", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 13, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "editorMode": "code", + "expr": "snake_active_games_stale", + "legendFormat": "Stale Active Games", + "range": true, + "refId": "A" + } + ], + "title": "Stale Active Games", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 24 + }, + "id": 14, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "editorMode": "code", + "expr": "increase(snake_games_stuck_removed_total[$__range])", + "legendFormat": "Stuck Games Removed", + "range": true, + "refId": "A" + } + ], + "title": "Stuck Games Removed", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 90 + }, + { + "color": "red", + "value": 150 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 24 + }, + "id": 15, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "editorMode": "code", + "expr": "snake_oldest_active_game_age_sec", + "legendFormat": "Oldest Active Game Age", + "range": true, + "refId": "A" + } + ], + "title": "Oldest Active Game Age", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 24 + }, + "id": 16, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "editorMode": "code", + "expr": "snake_stale_game_timeout_sec", + "legendFormat": "Stale Timeout (Sec)", + "range": true, + "refId": "A" + } + ], + "title": "Stale Timeout (Sec)", + "type": "stat" } ], "refresh": "10s", @@ -912,6 +1213,6 @@ "timezone": "", "title": "Snake Performance", "uid": "snake-performance", - "version": 2, + "version": 3, "weekStart": "" } diff --git a/server/Server.py b/server/Server.py index d72dc59..83dd8bd 100644 --- a/server/Server.py +++ b/server/Server.py @@ -33,6 +33,8 @@ class Server: self.running_games:dict[str, GameBoard] = {} self.game_move_counts:dict[str, int] = {} + self.game_last_seen_unix:dict[str, int] = {} + self.stale_game_timeout_sec = self._get_stale_game_timeout_sec() self.metrics = { 'games_started': 0, 'games_ended': 0, @@ -49,9 +51,6 @@ class Server: 'start': 0, 'move': 0, 'end': 0, - 'cleanup': 0, - 'metrics': 0, - 'metrics_prometheus': 0, }, 'move_direction_counts': { 'up': 0, @@ -65,6 +64,7 @@ class Server: 'last_game_start_unix': 0, 'last_game_end_unix': 0, 'last_move_unix': 0, + 'games_stuck_removed': 0, } self.logger = build_logger('Battlesnake', debug_env_var='DEBUG_SERVER') self.snake_version = self._get_snake_version() @@ -86,6 +86,7 @@ class Server: @self.app.post('/start') async def on_start(): self._record_http_request('start') + self._prune_stale_games() game_state = await request.get_json() await self._create_game_board(game_state) await await_log(self.logger.info(f'GAME START: {game_state['game']}')) @@ -122,6 +123,7 @@ class Server: @self.app.post('/end') async def on_end(): self._record_http_request('end') + self._prune_stale_games() game_state = await request.get_json() if self.store_game_state: game_board = await self._get_game_board(game_state, end=True) @@ -130,7 +132,7 @@ class Server: StorageLoader.build(self.storage_type), file_path=os.path.join(self.data_path, 'data'), database=os.getenv('EDGEDB_DATABASE', None), - tls_security=None + tls_security=None, ) else: await game_board.save( @@ -208,7 +210,14 @@ class Server: return self.default_snake_config['version'] return str(version) - async def _create_game_board(self, game_state:dict): + def _get_stale_game_timeout_sec(self) -> int: + value = os.getenv('SNAKE_STUCK_GAME_TIMEOUT_SEC', '180') + try: + return max(30, int(value)) + except ValueError: + return 180 + + async def _create_game_board(self, game_state: dict): game_id = game_state['game']['id'] new_game_board = GameBoard( game_id=game_id, @@ -223,6 +232,7 @@ class Server: self.running_games[game_id] = new_game_board self.game_move_counts[game_id] = 0 + self.game_last_seen_unix[game_id] = int(time.time()) self.metrics['games_started'] += 1 self.metrics['active_games_peak'] = max( self.metrics['active_games_peak'], @@ -233,8 +243,9 @@ class Server: def _delete_game_board(self, game_state:dict): game_id = game_state['game']['id'] - del self.running_games[game_id] + self.running_games.pop(game_id, None) self.game_move_counts.pop(game_id, None) + self.game_last_seen_unix.pop(game_id, None) async def _get_game_board(self, game_state:dict, end:bool=False): game_id = game_state['game']['id'] @@ -248,6 +259,8 @@ class Server: self.metrics['total_moves'] += 1 self.game_move_counts[game_id] = self.game_move_counts.get(game_id, 0) + 1 + self.game_last_seen_unix[game_id] = int(time.time()) + game_board.read_game_data(game_state) if end: self._record_game_end(game_state) @@ -262,6 +275,22 @@ class Server: storage = StorageLoader.build(self.storage_type)() return storage.cleanup() + def _prune_stale_games(self): + if not self.running_games: + return + + now = int(time.time()) + stale_ids = [ + game_id + for game_id, last_seen in self.game_last_seen_unix.items() + if now - last_seen >= self.stale_game_timeout_sec + ] + for game_id in stale_ids: + self.running_games.pop(game_id, None) + self.game_move_counts.pop(game_id, None) + self.game_last_seen_unix.pop(game_id, None) + self.metrics['games_stuck_removed'] += 1 + def _record_game_end(self, game_state: dict): self.metrics['games_ended'] += 1 self.metrics['last_game_end_unix'] = int(time.time()) @@ -286,6 +315,16 @@ class Server: win_rate = self.metrics['wins'] / games_ended if games_ended else 0.0 avg_move_ms = self.metrics['move_response_time_ms_total'] / total_moves if total_moves else 0.0 + now = int(time.time()) + oldest_active_age = 0 + if self.game_last_seen_unix: + oldest_active_age = max(0, now - min(self.game_last_seen_unix.values())) + stale_candidates = sum( + 1 + for last_seen in self.game_last_seen_unix.values() + if now - last_seen >= self.stale_game_timeout_sec + ) + return { **self.metrics, 'active_games': len(self.running_games), @@ -295,6 +334,9 @@ class Server: 'avg_move_response_ms': round(avg_move_ms, 2), 'http_requests_by_endpoint': dict(self.metrics['http_requests_by_endpoint']), 'move_direction_counts': dict(self.metrics['move_direction_counts']), + 'oldest_active_game_age_sec': oldest_active_age, + 'stale_game_timeout_sec': self.stale_game_timeout_sec, + 'active_games_stale': stale_candidates, } def _record_http_request(self, endpoint:str): @@ -343,10 +385,10 @@ class Server: f'snake_http_requests_total {snapshot['http_requests_total']}', '# HELP snake_move_response_ms_total Total move endpoint compute time in milliseconds.', '# TYPE snake_move_response_ms_total counter', - f"snake_move_response_ms_total {round(snapshot['move_response_time_ms_total'], 3)}", + f'snake_move_response_ms_total {round(snapshot['move_response_time_ms_total'], 3)}', '# HELP snake_move_response_ms_max Maximum move endpoint compute time in milliseconds.', '# TYPE snake_move_response_ms_max gauge', - f"snake_move_response_ms_max {round(snapshot['move_response_time_ms_max'], 3)}", + f'snake_move_response_ms_max {round(snapshot['move_response_time_ms_max'], 3)}', '# HELP snake_avg_turns_per_game Average final turn per ended game.', '# TYPE snake_avg_turns_per_game gauge', f'snake_avg_turns_per_game {snapshot['avg_turns_per_game']}', @@ -365,6 +407,18 @@ class Server: '# HELP snake_last_move_unix Unix timestamp of most recent /move response.', '# TYPE snake_last_move_unix gauge', f'snake_last_move_unix {snapshot['last_move_unix']}', + '# HELP snake_games_stuck_removed_total Active games auto-removed due to inactivity timeout.', + '# TYPE snake_games_stuck_removed_total counter', + f'snake_games_stuck_removed_total {snapshot['games_stuck_removed']}', + '# HELP snake_oldest_active_game_age_sec Age in seconds of the oldest active game.', + '# TYPE snake_oldest_active_game_age_sec gauge', + f'snake_oldest_active_game_age_sec {snapshot['oldest_active_game_age_sec']}', + '# HELP snake_stale_game_timeout_sec Configured inactivity timeout for stale games.', + '# TYPE snake_stale_game_timeout_sec gauge', + f'snake_stale_game_timeout_sec {snapshot['stale_game_timeout_sec']}', + '# HELP snake_active_games_stale Active games currently beyond stale timeout.', + '# TYPE snake_active_games_stale gauge', + f'snake_active_games_stale {snapshot['active_games_stale']}', ] lines.extend([