auto remove stuck games
Build and Push Docker Container / build-and-push (push) Successful in 51s

This commit is contained in:
2026-04-04 10:23:12 +02:00
parent dbcf9cadaf
commit c6ebb5834b
2 changed files with 367 additions and 12 deletions
+305 -4
View File
@@ -473,6 +473,14 @@
{ {
"color": "green", "color": "green",
"value": null "value": null
},
{
"color": "yellow",
"value": 120
},
{
"color": "red",
"value": 250
} }
] ]
}, },
@@ -488,7 +496,7 @@
}, },
"id": 7, "id": 7,
"options": { "options": {
"colorMode": "value", "colorMode": "background",
"graphMode": "none", "graphMode": "none",
"justifyMode": "auto", "justifyMode": "auto",
"orientation": "auto", "orientation": "auto",
@@ -531,6 +539,10 @@
{ {
"color": "green", "color": "green",
"value": null "value": null
},
{
"color": "red",
"value": 1
} }
] ]
} }
@@ -545,7 +557,7 @@
}, },
"id": 8, "id": 8,
"options": { "options": {
"colorMode": "value", "colorMode": "background",
"graphMode": "none", "graphMode": "none",
"justifyMode": "auto", "justifyMode": "auto",
"orientation": "auto", "orientation": "auto",
@@ -825,7 +837,42 @@
] ]
} }
}, },
"overrides": [] "overrides": [
{
"matcher": {
"id": "byName",
"options": "avg move ms"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "orange"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "max move ms"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
},
{
"id": "custom.lineWidth",
"value": 3
}
]
}
]
}, },
"gridPos": { "gridPos": {
"h": 8, "h": 8,
@@ -872,6 +919,260 @@
], ],
"title": "Move Directions + Move Latency", "title": "Move Directions + Move Latency",
"type": "timeseries" "type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 2
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 24
},
"id": 13,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "11.0.0",
"targets": [
{
"editorMode": "code",
"expr": "snake_active_games_stale",
"legendFormat": "Stale Active Games",
"range": true,
"refId": "A"
}
],
"title": "Stale Active Games",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 24
},
"id": 14,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "11.0.0",
"targets": [
{
"editorMode": "code",
"expr": "increase(snake_games_stuck_removed_total[$__range])",
"legendFormat": "Stuck Games Removed",
"range": true,
"refId": "A"
}
],
"title": "Stuck Games Removed",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 90
},
{
"color": "red",
"value": 150
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 24
},
"id": 15,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "11.0.0",
"targets": [
{
"editorMode": "code",
"expr": "snake_oldest_active_game_age_sec",
"legendFormat": "Oldest Active Game Age",
"range": true,
"refId": "A"
}
],
"title": "Oldest Active Game Age",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 24
},
"id": 16,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "11.0.0",
"targets": [
{
"editorMode": "code",
"expr": "snake_stale_game_timeout_sec",
"legendFormat": "Stale Timeout (Sec)",
"range": true,
"refId": "A"
}
],
"title": "Stale Timeout (Sec)",
"type": "stat"
} }
], ],
"refresh": "10s", "refresh": "10s",
@@ -912,6 +1213,6 @@
"timezone": "", "timezone": "",
"title": "Snake Performance", "title": "Snake Performance",
"uid": "snake-performance", "uid": "snake-performance",
"version": 2, "version": 3,
"weekStart": "" "weekStart": ""
} }
+61 -7
View File
@@ -33,6 +33,8 @@ class Server:
self.running_games:dict[str, GameBoard] = {} self.running_games:dict[str, GameBoard] = {}
self.game_move_counts:dict[str, int] = {} self.game_move_counts:dict[str, int] = {}
self.game_last_seen_unix:dict[str, int] = {}
self.stale_game_timeout_sec = self._get_stale_game_timeout_sec()
self.metrics = { self.metrics = {
'games_started': 0, 'games_started': 0,
'games_ended': 0, 'games_ended': 0,
@@ -49,9 +51,6 @@ class Server:
'start': 0, 'start': 0,
'move': 0, 'move': 0,
'end': 0, 'end': 0,
'cleanup': 0,
'metrics': 0,
'metrics_prometheus': 0,
}, },
'move_direction_counts': { 'move_direction_counts': {
'up': 0, 'up': 0,
@@ -65,6 +64,7 @@ class Server:
'last_game_start_unix': 0, 'last_game_start_unix': 0,
'last_game_end_unix': 0, 'last_game_end_unix': 0,
'last_move_unix': 0, 'last_move_unix': 0,
'games_stuck_removed': 0,
} }
self.logger = build_logger('Battlesnake', debug_env_var='DEBUG_SERVER') self.logger = build_logger('Battlesnake', debug_env_var='DEBUG_SERVER')
self.snake_version = self._get_snake_version() self.snake_version = self._get_snake_version()
@@ -86,6 +86,7 @@ class Server:
@self.app.post('/start') @self.app.post('/start')
async def on_start(): async def on_start():
self._record_http_request('start') self._record_http_request('start')
self._prune_stale_games()
game_state = await request.get_json() game_state = await request.get_json()
await self._create_game_board(game_state) await self._create_game_board(game_state)
await await_log(self.logger.info(f'GAME START: {game_state['game']}')) await await_log(self.logger.info(f'GAME START: {game_state['game']}'))
@@ -122,6 +123,7 @@ class Server:
@self.app.post('/end') @self.app.post('/end')
async def on_end(): async def on_end():
self._record_http_request('end') self._record_http_request('end')
self._prune_stale_games()
game_state = await request.get_json() game_state = await request.get_json()
if self.store_game_state: if self.store_game_state:
game_board = await self._get_game_board(game_state, end=True) game_board = await self._get_game_board(game_state, end=True)
@@ -130,7 +132,7 @@ class Server:
StorageLoader.build(self.storage_type), StorageLoader.build(self.storage_type),
file_path=os.path.join(self.data_path, 'data'), file_path=os.path.join(self.data_path, 'data'),
database=os.getenv('EDGEDB_DATABASE', None), database=os.getenv('EDGEDB_DATABASE', None),
tls_security=None tls_security=None,
) )
else: else:
await game_board.save( await game_board.save(
@@ -208,6 +210,13 @@ class Server:
return self.default_snake_config['version'] return self.default_snake_config['version']
return str(version) return str(version)
def _get_stale_game_timeout_sec(self) -> int:
value = os.getenv('SNAKE_STUCK_GAME_TIMEOUT_SEC', '180')
try:
return max(30, int(value))
except ValueError:
return 180
async def _create_game_board(self, game_state: dict): async def _create_game_board(self, game_state: dict):
game_id = game_state['game']['id'] game_id = game_state['game']['id']
new_game_board = GameBoard( new_game_board = GameBoard(
@@ -223,6 +232,7 @@ class Server:
self.running_games[game_id] = new_game_board self.running_games[game_id] = new_game_board
self.game_move_counts[game_id] = 0 self.game_move_counts[game_id] = 0
self.game_last_seen_unix[game_id] = int(time.time())
self.metrics['games_started'] += 1 self.metrics['games_started'] += 1
self.metrics['active_games_peak'] = max( self.metrics['active_games_peak'] = max(
self.metrics['active_games_peak'], self.metrics['active_games_peak'],
@@ -233,8 +243,9 @@ class Server:
def _delete_game_board(self, game_state:dict): def _delete_game_board(self, game_state:dict):
game_id = game_state['game']['id'] game_id = game_state['game']['id']
del self.running_games[game_id] self.running_games.pop(game_id, None)
self.game_move_counts.pop(game_id, None) self.game_move_counts.pop(game_id, None)
self.game_last_seen_unix.pop(game_id, None)
async def _get_game_board(self, game_state:dict, end:bool=False): async def _get_game_board(self, game_state:dict, end:bool=False):
game_id = game_state['game']['id'] game_id = game_state['game']['id']
@@ -248,6 +259,8 @@ class Server:
self.metrics['total_moves'] += 1 self.metrics['total_moves'] += 1
self.game_move_counts[game_id] = self.game_move_counts.get(game_id, 0) + 1 self.game_move_counts[game_id] = self.game_move_counts.get(game_id, 0) + 1
self.game_last_seen_unix[game_id] = int(time.time())
game_board.read_game_data(game_state) game_board.read_game_data(game_state)
if end: if end:
self._record_game_end(game_state) self._record_game_end(game_state)
@@ -262,6 +275,22 @@ class Server:
storage = StorageLoader.build(self.storage_type)() storage = StorageLoader.build(self.storage_type)()
return storage.cleanup() return storage.cleanup()
def _prune_stale_games(self):
if not self.running_games:
return
now = int(time.time())
stale_ids = [
game_id
for game_id, last_seen in self.game_last_seen_unix.items()
if now - last_seen >= self.stale_game_timeout_sec
]
for game_id in stale_ids:
self.running_games.pop(game_id, None)
self.game_move_counts.pop(game_id, None)
self.game_last_seen_unix.pop(game_id, None)
self.metrics['games_stuck_removed'] += 1
def _record_game_end(self, game_state: dict): def _record_game_end(self, game_state: dict):
self.metrics['games_ended'] += 1 self.metrics['games_ended'] += 1
self.metrics['last_game_end_unix'] = int(time.time()) self.metrics['last_game_end_unix'] = int(time.time())
@@ -286,6 +315,16 @@ class Server:
win_rate = self.metrics['wins'] / games_ended if games_ended else 0.0 win_rate = self.metrics['wins'] / games_ended if games_ended else 0.0
avg_move_ms = self.metrics['move_response_time_ms_total'] / total_moves if total_moves else 0.0 avg_move_ms = self.metrics['move_response_time_ms_total'] / total_moves if total_moves else 0.0
now = int(time.time())
oldest_active_age = 0
if self.game_last_seen_unix:
oldest_active_age = max(0, now - min(self.game_last_seen_unix.values()))
stale_candidates = sum(
1
for last_seen in self.game_last_seen_unix.values()
if now - last_seen >= self.stale_game_timeout_sec
)
return { return {
**self.metrics, **self.metrics,
'active_games': len(self.running_games), 'active_games': len(self.running_games),
@@ -295,6 +334,9 @@ class Server:
'avg_move_response_ms': round(avg_move_ms, 2), 'avg_move_response_ms': round(avg_move_ms, 2),
'http_requests_by_endpoint': dict(self.metrics['http_requests_by_endpoint']), 'http_requests_by_endpoint': dict(self.metrics['http_requests_by_endpoint']),
'move_direction_counts': dict(self.metrics['move_direction_counts']), 'move_direction_counts': dict(self.metrics['move_direction_counts']),
'oldest_active_game_age_sec': oldest_active_age,
'stale_game_timeout_sec': self.stale_game_timeout_sec,
'active_games_stale': stale_candidates,
} }
def _record_http_request(self, endpoint:str): def _record_http_request(self, endpoint:str):
@@ -343,10 +385,10 @@ class Server:
f'snake_http_requests_total {snapshot['http_requests_total']}', f'snake_http_requests_total {snapshot['http_requests_total']}',
'# HELP snake_move_response_ms_total Total move endpoint compute time in milliseconds.', '# HELP snake_move_response_ms_total Total move endpoint compute time in milliseconds.',
'# TYPE snake_move_response_ms_total counter', '# TYPE snake_move_response_ms_total counter',
f"snake_move_response_ms_total {round(snapshot['move_response_time_ms_total'], 3)}", f'snake_move_response_ms_total {round(snapshot['move_response_time_ms_total'], 3)}',
'# HELP snake_move_response_ms_max Maximum move endpoint compute time in milliseconds.', '# HELP snake_move_response_ms_max Maximum move endpoint compute time in milliseconds.',
'# TYPE snake_move_response_ms_max gauge', '# TYPE snake_move_response_ms_max gauge',
f"snake_move_response_ms_max {round(snapshot['move_response_time_ms_max'], 3)}", f'snake_move_response_ms_max {round(snapshot['move_response_time_ms_max'], 3)}',
'# HELP snake_avg_turns_per_game Average final turn per ended game.', '# HELP snake_avg_turns_per_game Average final turn per ended game.',
'# TYPE snake_avg_turns_per_game gauge', '# TYPE snake_avg_turns_per_game gauge',
f'snake_avg_turns_per_game {snapshot['avg_turns_per_game']}', f'snake_avg_turns_per_game {snapshot['avg_turns_per_game']}',
@@ -365,6 +407,18 @@ class Server:
'# HELP snake_last_move_unix Unix timestamp of most recent /move response.', '# HELP snake_last_move_unix Unix timestamp of most recent /move response.',
'# TYPE snake_last_move_unix gauge', '# TYPE snake_last_move_unix gauge',
f'snake_last_move_unix {snapshot['last_move_unix']}', f'snake_last_move_unix {snapshot['last_move_unix']}',
'# HELP snake_games_stuck_removed_total Active games auto-removed due to inactivity timeout.',
'# TYPE snake_games_stuck_removed_total counter',
f'snake_games_stuck_removed_total {snapshot['games_stuck_removed']}',
'# HELP snake_oldest_active_game_age_sec Age in seconds of the oldest active game.',
'# TYPE snake_oldest_active_game_age_sec gauge',
f'snake_oldest_active_game_age_sec {snapshot['oldest_active_game_age_sec']}',
'# HELP snake_stale_game_timeout_sec Configured inactivity timeout for stale games.',
'# TYPE snake_stale_game_timeout_sec gauge',
f'snake_stale_game_timeout_sec {snapshot['stale_game_timeout_sec']}',
'# HELP snake_active_games_stale Active games currently beyond stale timeout.',
'# TYPE snake_active_games_stale gauge',
f'snake_active_games_stale {snapshot['active_games_stale']}',
] ]
lines.extend([ lines.extend([