auto remove stuck games
Build and Push Docker Container / build-and-push (push) Successful in 51s

This commit is contained in:
2026-04-04 10:23:12 +02:00
parent dbcf9cadaf
commit c6ebb5834b
2 changed files with 367 additions and 12 deletions
+62 -8
View File
@@ -33,6 +33,8 @@ class Server:
self.running_games:dict[str, GameBoard] = {}
self.game_move_counts:dict[str, int] = {}
self.game_last_seen_unix:dict[str, int] = {}
self.stale_game_timeout_sec = self._get_stale_game_timeout_sec()
self.metrics = {
'games_started': 0,
'games_ended': 0,
@@ -49,9 +51,6 @@ class Server:
'start': 0,
'move': 0,
'end': 0,
'cleanup': 0,
'metrics': 0,
'metrics_prometheus': 0,
},
'move_direction_counts': {
'up': 0,
@@ -65,6 +64,7 @@ class Server:
'last_game_start_unix': 0,
'last_game_end_unix': 0,
'last_move_unix': 0,
'games_stuck_removed': 0,
}
self.logger = build_logger('Battlesnake', debug_env_var='DEBUG_SERVER')
self.snake_version = self._get_snake_version()
@@ -86,6 +86,7 @@ class Server:
@self.app.post('/start')
async def on_start():
self._record_http_request('start')
self._prune_stale_games()
game_state = await request.get_json()
await self._create_game_board(game_state)
await await_log(self.logger.info(f'GAME START: {game_state['game']}'))
@@ -122,6 +123,7 @@ class Server:
@self.app.post('/end')
async def on_end():
self._record_http_request('end')
self._prune_stale_games()
game_state = await request.get_json()
if self.store_game_state:
game_board = await self._get_game_board(game_state, end=True)
@@ -130,7 +132,7 @@ class Server:
StorageLoader.build(self.storage_type),
file_path=os.path.join(self.data_path, 'data'),
database=os.getenv('EDGEDB_DATABASE', None),
tls_security=None
tls_security=None,
)
else:
await game_board.save(
@@ -208,7 +210,14 @@ class Server:
return self.default_snake_config['version']
return str(version)
async def _create_game_board(self, game_state:dict):
def _get_stale_game_timeout_sec(self) -> int:
    """Return the inactivity timeout, in seconds, used to detect stale games.

    The value is read from the ``SNAKE_STUCK_GAME_TIMEOUT_SEC`` environment
    variable. An unset variable or a value that is not an integer yields
    180; integer values below 30 are raised to 30.
    """
    raw_timeout = os.environ.get('SNAKE_STUCK_GAME_TIMEOUT_SEC', '180')
    try:
        parsed = int(raw_timeout)
    except ValueError:
        # Unparseable configuration: fall back to the default timeout.
        return 180
    # Enforce the 30-second floor.
    return parsed if parsed > 30 else 30
async def _create_game_board(self, game_state: dict):
game_id = game_state['game']['id']
new_game_board = GameBoard(
game_id=game_id,
@@ -223,6 +232,7 @@ class Server:
self.running_games[game_id] = new_game_board
self.game_move_counts[game_id] = 0
self.game_last_seen_unix[game_id] = int(time.time())
self.metrics['games_started'] += 1
self.metrics['active_games_peak'] = max(
self.metrics['active_games_peak'],
@@ -233,8 +243,9 @@ class Server:
def _delete_game_board(self, game_state:dict):
    """Forget all per-game tracking state for the game in *game_state*.

    The game id is read from ``game_state['game']['id']`` and removed from
    the running boards, the per-game move counters and the last-seen
    timestamps.

    Removal is tolerant: an id that is not (or no longer) tracked is
    silently ignored instead of raising ``KeyError``, so deleting a game
    twice, or deleting one that was already dropped, is safe.

    Raises:
        KeyError: if *game_state* lacks the ``'game'`` / ``'id'`` keys.
    """
    game_id = game_state['game']['id']
    # A plain ``del`` would raise for unknown ids; ``pop`` with a default
    # keeps all three registries in sync without failing.
    for registry in (
        self.running_games,
        self.game_move_counts,
        self.game_last_seen_unix,
    ):
        registry.pop(game_id, None)
async def _get_game_board(self, game_state:dict, end:bool=False):
game_id = game_state['game']['id']
@@ -248,6 +259,8 @@ class Server:
self.metrics['total_moves'] += 1
self.game_move_counts[game_id] = self.game_move_counts.get(game_id, 0) + 1
self.game_last_seen_unix[game_id] = int(time.time())
game_board.read_game_data(game_state)
if end:
self._record_game_end(game_state)
@@ -262,6 +275,22 @@ class Server:
storage = StorageLoader.build(self.storage_type)()
return storage.cleanup()
def _prune_stale_games(self):
    """Drop every game that has been inactive for at least the stale timeout.

    A game is stale when ``now - last_seen >= self.stale_game_timeout_sec``,
    where ``last_seen`` comes from ``self.game_last_seen_unix``. For each
    stale id the running board, the move counter and the last-seen
    timestamp are removed.

    ``self.metrics['games_stuck_removed']`` is incremented once per stale
    id that actually had a running board, so leftover timestamps without a
    board are cleaned up without inflating the counter.
    """
    # Guard on the dict that is iterated below, so leftover timestamps are
    # still cleaned up when no board is currently running.
    if not self.game_last_seen_unix:
        return

    now = int(time.time())
    # Collect ids first: the dicts are mutated in the loop below.
    stale_ids = [
        game_id
        for game_id, last_seen in self.game_last_seen_unix.items()
        if now - last_seen >= self.stale_game_timeout_sec
    ]

    for game_id in stale_ids:
        had_board = game_id in self.running_games
        self.running_games.pop(game_id, None)
        self.game_move_counts.pop(game_id, None)
        self.game_last_seen_unix.pop(game_id, None)
        if had_board:
            self.metrics['games_stuck_removed'] += 1
def _record_game_end(self, game_state: dict):
self.metrics['games_ended'] += 1
self.metrics['last_game_end_unix'] = int(time.time())
@@ -286,6 +315,16 @@ class Server:
win_rate = self.metrics['wins'] / games_ended if games_ended else 0.0
avg_move_ms = self.metrics['move_response_time_ms_total'] / total_moves if total_moves else 0.0
now = int(time.time())
oldest_active_age = 0
if self.game_last_seen_unix:
oldest_active_age = max(0, now - min(self.game_last_seen_unix.values()))
stale_candidates = sum(
1
for last_seen in self.game_last_seen_unix.values()
if now - last_seen >= self.stale_game_timeout_sec
)
return {
**self.metrics,
'active_games': len(self.running_games),
@@ -295,6 +334,9 @@ class Server:
'avg_move_response_ms': round(avg_move_ms, 2),
'http_requests_by_endpoint': dict(self.metrics['http_requests_by_endpoint']),
'move_direction_counts': dict(self.metrics['move_direction_counts']),
'oldest_active_game_age_sec': oldest_active_age,
'stale_game_timeout_sec': self.stale_game_timeout_sec,
'active_games_stale': stale_candidates,
}
def _record_http_request(self, endpoint:str):
@@ -343,10 +385,10 @@ class Server:
f'snake_http_requests_total {snapshot['http_requests_total']}',
'# HELP snake_move_response_ms_total Total move endpoint compute time in milliseconds.',
'# TYPE snake_move_response_ms_total counter',
f"snake_move_response_ms_total {round(snapshot['move_response_time_ms_total'], 3)}",
f'snake_move_response_ms_total {round(snapshot['move_response_time_ms_total'], 3)}',
'# HELP snake_move_response_ms_max Maximum move endpoint compute time in milliseconds.',
'# TYPE snake_move_response_ms_max gauge',
f"snake_move_response_ms_max {round(snapshot['move_response_time_ms_max'], 3)}",
f'snake_move_response_ms_max {round(snapshot['move_response_time_ms_max'], 3)}',
'# HELP snake_avg_turns_per_game Average final turn per ended game.',
'# TYPE snake_avg_turns_per_game gauge',
f'snake_avg_turns_per_game {snapshot['avg_turns_per_game']}',
@@ -365,6 +407,18 @@ class Server:
'# HELP snake_last_move_unix Unix timestamp of most recent /move response.',
'# TYPE snake_last_move_unix gauge',
f'snake_last_move_unix {snapshot['last_move_unix']}',
'# HELP snake_games_stuck_removed_total Active games auto-removed due to inactivity timeout.',
'# TYPE snake_games_stuck_removed_total counter',
f'snake_games_stuck_removed_total {snapshot['games_stuck_removed']}',
'# HELP snake_oldest_active_game_age_sec Age in seconds of the oldest active game.',
'# TYPE snake_oldest_active_game_age_sec gauge',
f'snake_oldest_active_game_age_sec {snapshot['oldest_active_game_age_sec']}',
'# HELP snake_stale_game_timeout_sec Configured inactivity timeout for stale games.',
'# TYPE snake_stale_game_timeout_sec gauge',
f'snake_stale_game_timeout_sec {snapshot['stale_game_timeout_sec']}',
'# HELP snake_active_games_stale Active games currently beyond stale timeout.',
'# TYPE snake_active_games_stale gauge',
f'snake_active_games_stale {snapshot['active_games_stale']}',
]
lines.extend([