Make Metrics Collector class and auto-send to Redis when a change happens
Build and Push Docker Container / build-and-push (push) Successful in 1m0s
Build and Push Docker Container / build-and-push (push) Successful in 1m0s
This commit is contained in:
+33
-227
@@ -8,6 +8,7 @@ from quart_common.web.logger import await_log
|
|||||||
from quart_common.web.logger import build_logger
|
from quart_common.web.logger import build_logger
|
||||||
|
|
||||||
from server.metrics.MetricsManager import MetricsManager
|
from server.metrics.MetricsManager import MetricsManager
|
||||||
|
from server.metrics.ServerMetricsCollector import ServerMetricsCollector
|
||||||
from server.storage.StorageLoader import StorageLoader
|
from server.storage.StorageLoader import StorageLoader
|
||||||
|
|
||||||
from quart import Quart, request, jsonify
|
from quart import Quart, request, jsonify
|
||||||
@@ -41,51 +42,24 @@ class Server:
|
|||||||
redis_url=game_state_redis_url,
|
redis_url=game_state_redis_url,
|
||||||
ttl_seconds=game_state_ttl_sec,
|
ttl_seconds=game_state_ttl_sec,
|
||||||
)
|
)
|
||||||
self.metrics_backend = (metrics_backend or 'memory').strip().lower()
|
metrics_backend_normalized = (metrics_backend or 'memory').strip().lower()
|
||||||
self.metrics_manager = MetricsManager(
|
self.stale_game_timeout_sec = self._get_stale_game_timeout_sec()
|
||||||
backend=self.metrics_backend,
|
|
||||||
redis_url=metrics_redis_url,
|
|
||||||
ttl_seconds=metrics_ttl_sec,
|
|
||||||
key_prefix=os.environ.get('METRICS_REDIS_KEY_PREFIX', 'snake:metrics:worker'),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.running_games:dict[str, GameBoard] = {}
|
self.running_games:dict[str, GameBoard] = {}
|
||||||
self.game_move_counts:dict[str, int] = {}
|
self.game_move_counts:dict[str, int] = {}
|
||||||
self.game_last_seen_unix:dict[str, int] = {}
|
self.game_last_seen_unix:dict[str, int] = {}
|
||||||
self.stale_game_timeout_sec = self._get_stale_game_timeout_sec()
|
self.metrics_collector = ServerMetricsCollector(
|
||||||
self.metrics = {
|
metrics_manager=MetricsManager(
|
||||||
'games_started': 0,
|
backend=metrics_backend_normalized,
|
||||||
'games_ended': 0,
|
redis_url=metrics_redis_url,
|
||||||
'wins': 0,
|
ttl_seconds=metrics_ttl_sec,
|
||||||
'losses': 0,
|
key_prefix=os.environ.get('METRICS_REDIS_KEY_PREFIX', 'snake:metrics:worker'),
|
||||||
'total_moves': 0,
|
),
|
||||||
'total_turns': 0,
|
game_state_local_cache=self.game_state_local_cache,
|
||||||
'max_turn': 0,
|
metrics_backend=metrics_backend_normalized,
|
||||||
'active_games_peak': 0,
|
stale_game_timeout_sec=self.stale_game_timeout_sec,
|
||||||
'games_autocreated': 0,
|
game_last_seen_unix=self.game_last_seen_unix,
|
||||||
'http_requests_total': 0,
|
game_move_counts=self.game_move_counts,
|
||||||
'http_requests_by_endpoint': {
|
)
|
||||||
'info': 0,
|
|
||||||
'start': 0,
|
|
||||||
'move': 0,
|
|
||||||
'end': 0,
|
|
||||||
},
|
|
||||||
'move_direction_counts': {
|
|
||||||
'up': 0,
|
|
||||||
'down': 0,
|
|
||||||
'left': 0,
|
|
||||||
'right': 0,
|
|
||||||
'unknown': 0,
|
|
||||||
},
|
|
||||||
'move_response_time_ms_total': 0.0,
|
|
||||||
'move_response_time_ms_max': 0.0,
|
|
||||||
'last_game_start_unix': 0,
|
|
||||||
'last_game_end_unix': 0,
|
|
||||||
'last_move_unix': 0,
|
|
||||||
'games_stuck_removed': 0,
|
|
||||||
'game_state_local_cache_enabled': bool(self.game_state_local_cache),
|
|
||||||
'metrics_backend': self.metrics_backend,
|
|
||||||
}
|
|
||||||
self.logger = build_logger('Battlesnake', debug_env_var='DEBUG_SERVER')
|
self.logger = build_logger('Battlesnake', debug_env_var='DEBUG_SERVER')
|
||||||
self.snake_version = self._get_snake_version()
|
self.snake_version = self._get_snake_version()
|
||||||
|
|
||||||
@@ -96,7 +70,7 @@ class Server:
|
|||||||
# TIP: If you open your Battlesnake URL in a browser you should see this data
|
# TIP: If you open your Battlesnake URL in a browser you should see this data
|
||||||
@self.app.get('/')
|
@self.app.get('/')
|
||||||
async def on_info():
|
async def on_info():
|
||||||
self._record_http_request('info')
|
self.metrics_collector.record_http_request('info')
|
||||||
snake_config = await self._read_json_config_or_create()
|
snake_config = await self._read_json_config_or_create()
|
||||||
|
|
||||||
await await_log(self.logger.info(f'INFO Snake: {snake_config}'))
|
await await_log(self.logger.info(f'INFO Snake: {snake_config}'))
|
||||||
@@ -105,8 +79,8 @@ class Server:
|
|||||||
# start is called when your Battlesnake begins a game
|
# start is called when your Battlesnake begins a game
|
||||||
@self.app.post('/start')
|
@self.app.post('/start')
|
||||||
async def on_start():
|
async def on_start():
|
||||||
self._record_http_request('start')
|
self.metrics_collector.record_http_request('start')
|
||||||
self._prune_stale_games()
|
await self._prune_stale_games()
|
||||||
game_state = await request.get_json()
|
game_state = await request.get_json()
|
||||||
await self._create_game_board(game_state)
|
await self._create_game_board(game_state)
|
||||||
await await_log(self.logger.info(f'GAME START: {game_state['game']}'))
|
await await_log(self.logger.info(f'GAME START: {game_state['game']}'))
|
||||||
@@ -115,25 +89,14 @@ class Server:
|
|||||||
# move is called when your Battlesnake game is running game
|
# move is called when your Battlesnake game is running game
|
||||||
@self.app.post('/move')
|
@self.app.post('/move')
|
||||||
async def on_move():
|
async def on_move():
|
||||||
self._record_http_request('move')
|
self.metrics_collector.record_http_request('move')
|
||||||
game_state = await request.get_json()
|
game_state = await request.get_json()
|
||||||
move_started = time.perf_counter()
|
move_started = time.perf_counter()
|
||||||
game_board = cast(GameBoard, await self._get_game_board(game_state))
|
game_board = cast(GameBoard, await self._get_game_board(game_state))
|
||||||
next_move = game_board.snake_neat_make_a_move()
|
next_move = game_board.snake_neat_make_a_move()
|
||||||
await self._persist_game_board(game_state['game']['id'], game_board)
|
await self._persist_game_board(game_state['game']['id'], game_board)
|
||||||
elapsed_ms = (time.perf_counter() - move_started) * 1000.0
|
elapsed_ms = (time.perf_counter() - move_started) * 1000.0
|
||||||
self.metrics['move_response_time_ms_total'] += elapsed_ms
|
await self.metrics_collector.record_move(next_move, elapsed_ms)
|
||||||
self.metrics['move_response_time_ms_max'] = max(
|
|
||||||
self.metrics['move_response_time_ms_max'],
|
|
||||||
elapsed_ms,
|
|
||||||
)
|
|
||||||
|
|
||||||
move_counts = self.metrics['move_direction_counts']
|
|
||||||
if next_move in move_counts:
|
|
||||||
move_counts[next_move] += 1
|
|
||||||
else:
|
|
||||||
move_counts['unknown'] += 1
|
|
||||||
self.metrics['last_move_unix'] = int(time.time())
|
|
||||||
|
|
||||||
if self.debug:
|
if self.debug:
|
||||||
await await_log(self.logger.debug(f'TURN: {game_state['turn']:3}, MOVE: {next_move:5}'))
|
await await_log(self.logger.debug(f'TURN: {game_state['turn']:3}, MOVE: {next_move:5}'))
|
||||||
@@ -143,8 +106,8 @@ class Server:
|
|||||||
# end is called when your Battlesnake finishes a game
|
# end is called when your Battlesnake finishes a game
|
||||||
@self.app.post('/end')
|
@self.app.post('/end')
|
||||||
async def on_end():
|
async def on_end():
|
||||||
self._record_http_request('end')
|
self.metrics_collector.record_http_request('end')
|
||||||
self._prune_stale_games()
|
await self._prune_stale_games()
|
||||||
game_state = await request.get_json()
|
game_state = await request.get_json()
|
||||||
if self.store_game_state:
|
if self.store_game_state:
|
||||||
game_board = cast(GameBoard, await self._get_game_board(game_state, end=True))
|
game_board = cast(GameBoard, await self._get_game_board(game_state, end=True))
|
||||||
@@ -174,7 +137,7 @@ class Server:
|
|||||||
@self.app.after_serving
|
@self.app.after_serving
|
||||||
async def shutdown_state_storage():
|
async def shutdown_state_storage():
|
||||||
await self.game_state_store.close()
|
await self.game_state_store.close()
|
||||||
await self._close_metrics_store()
|
await self.metrics_collector.close()
|
||||||
|
|
||||||
@self.app.get('/cleanup')
|
@self.app.get('/cleanup')
|
||||||
async def cleanup():
|
async def cleanup():
|
||||||
@@ -183,13 +146,14 @@ class Server:
|
|||||||
|
|
||||||
@self.app.get('/metrics')
|
@self.app.get('/metrics')
|
||||||
async def metrics():
|
async def metrics():
|
||||||
return jsonify(await self._build_metrics())
|
snapshot = await self.metrics_collector.build_snapshot(self.game_last_seen_unix, self.game_move_counts)
|
||||||
|
return jsonify(snapshot)
|
||||||
|
|
||||||
@self.app.get('/metrics/prometheus')
|
@self.app.get('/metrics/prometheus')
|
||||||
async def metrics_prometheus():
|
async def metrics_prometheus():
|
||||||
snapshot = await self._build_metrics()
|
snapshot = await self.metrics_collector.build_snapshot(self.game_last_seen_unix, self.game_move_counts)
|
||||||
return (
|
return (
|
||||||
self._build_prometheus_metrics(snapshot),
|
self.metrics_collector.build_prometheus_metrics(snapshot),
|
||||||
200,
|
200,
|
||||||
{'Content-Type': 'text/plain; version=0.0.4; charset=utf-8'},
|
{'Content-Type': 'text/plain; version=0.0.4; charset=utf-8'},
|
||||||
)
|
)
|
||||||
@@ -262,12 +226,7 @@ class Server:
|
|||||||
await self.game_state_store.save(game_id, new_game_board)
|
await self.game_state_store.save(game_id, new_game_board)
|
||||||
self.game_move_counts[game_id] = 0
|
self.game_move_counts[game_id] = 0
|
||||||
self.game_last_seen_unix[game_id] = int(time.time())
|
self.game_last_seen_unix[game_id] = int(time.time())
|
||||||
self.metrics['games_started'] += 1
|
await self.metrics_collector.record_game_started(len(self.game_last_seen_unix))
|
||||||
self.metrics['active_games_peak'] = max(
|
|
||||||
self.metrics['active_games_peak'],
|
|
||||||
len(self.game_last_seen_unix),
|
|
||||||
)
|
|
||||||
self.metrics['last_game_start_unix'] = int(time.time())
|
|
||||||
return new_game_board
|
return new_game_board
|
||||||
|
|
||||||
async def _persist_game_board(self, game_id:str, game_board:GameBoard):
|
async def _persist_game_board(self, game_id:str, game_board:GameBoard):
|
||||||
@@ -295,17 +254,16 @@ class Server:
|
|||||||
self.running_games[game_id] = game_board
|
self.running_games[game_id] = game_board
|
||||||
else:
|
else:
|
||||||
game_board = await self._create_game_board(game_state)
|
game_board = await self._create_game_board(game_state)
|
||||||
self.metrics['games_autocreated'] += 1
|
await self.metrics_collector.record_game_autocreated()
|
||||||
|
|
||||||
if not end:
|
if not end:
|
||||||
self.metrics['total_moves'] += 1
|
|
||||||
self.game_move_counts[game_id] = self.game_move_counts.get(game_id, 0) + 1
|
self.game_move_counts[game_id] = self.game_move_counts.get(game_id, 0) + 1
|
||||||
|
|
||||||
self.game_last_seen_unix[game_id] = int(time.time())
|
self.game_last_seen_unix[game_id] = int(time.time())
|
||||||
|
|
||||||
game_board.read_game_data(game_state)
|
game_board.read_game_data(game_state)
|
||||||
if end:
|
if end:
|
||||||
self._record_game_end(game_state)
|
await self.metrics_collector.record_game_end(game_state)
|
||||||
game_board.end_game(game_state)
|
game_board.end_game(game_state)
|
||||||
await self._persist_game_board(game_id, game_board)
|
await self._persist_game_board(game_id, game_board)
|
||||||
|
|
||||||
@@ -318,7 +276,7 @@ class Server:
|
|||||||
storage = StorageLoader.build(self.storage_type)()
|
storage = StorageLoader.build(self.storage_type)()
|
||||||
return storage.cleanup()
|
return storage.cleanup()
|
||||||
|
|
||||||
def _prune_stale_games(self):
|
async def _prune_stale_games(self):
|
||||||
if not self.game_last_seen_unix:
|
if not self.game_last_seen_unix:
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -332,156 +290,4 @@ class Server:
|
|||||||
self.running_games.pop(game_id, None)
|
self.running_games.pop(game_id, None)
|
||||||
self.game_move_counts.pop(game_id, None)
|
self.game_move_counts.pop(game_id, None)
|
||||||
self.game_last_seen_unix.pop(game_id, None)
|
self.game_last_seen_unix.pop(game_id, None)
|
||||||
self.metrics['games_stuck_removed'] += 1
|
await self.metrics_collector.record_stuck_removed()
|
||||||
|
|
||||||
def _record_game_end(self, game_state:dict):
|
|
||||||
self.metrics['games_ended'] += 1
|
|
||||||
self.metrics['last_game_end_unix'] = int(time.time())
|
|
||||||
|
|
||||||
final_turn = int(game_state.get('turn', 0))
|
|
||||||
self.metrics['total_turns'] += final_turn
|
|
||||||
self.metrics['max_turn'] = max(self.metrics['max_turn'], final_turn)
|
|
||||||
|
|
||||||
you_id = game_state.get('you', {}).get('id')
|
|
||||||
alive_snakes = game_state.get('board', {}).get('snakes', [])
|
|
||||||
alive_ids = {snake.get('id') for snake in alive_snakes}
|
|
||||||
|
|
||||||
if you_id and you_id in alive_ids:
|
|
||||||
self.metrics['wins'] += 1
|
|
||||||
else:
|
|
||||||
self.metrics['losses'] += 1
|
|
||||||
|
|
||||||
def _build_local_metrics(self) -> dict:
|
|
||||||
games_ended = self.metrics['games_ended']
|
|
||||||
total_moves = self.metrics['total_moves']
|
|
||||||
avg_turns = self.metrics['total_turns'] / games_ended if games_ended else 0.0
|
|
||||||
win_rate = self.metrics['wins'] / games_ended if games_ended else 0.0
|
|
||||||
avg_move_ms = (self.metrics['move_response_time_ms_total'] / total_moves if total_moves else 0.0)
|
|
||||||
|
|
||||||
now = int(time.time())
|
|
||||||
oldest_active_age = 0
|
|
||||||
if self.game_last_seen_unix:
|
|
||||||
oldest_active_age = max(0, now - min(self.game_last_seen_unix.values()))
|
|
||||||
stale_candidates = sum(
|
|
||||||
1
|
|
||||||
for last_seen in self.game_last_seen_unix.values()
|
|
||||||
if now - last_seen >= self.stale_game_timeout_sec
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
|
||||||
**self.metrics,
|
|
||||||
'active_games': len(self.game_last_seen_unix),
|
|
||||||
'tracked_games': len(self.game_move_counts),
|
|
||||||
'avg_turns_per_game': round(avg_turns, 2),
|
|
||||||
'win_rate': round(win_rate, 4),
|
|
||||||
'avg_move_response_ms': round(avg_move_ms, 2),
|
|
||||||
'http_requests_by_endpoint': dict(self.metrics['http_requests_by_endpoint']),
|
|
||||||
'move_direction_counts': dict(self.metrics['move_direction_counts']),
|
|
||||||
'oldest_active_game_age_sec': oldest_active_age,
|
|
||||||
'stale_game_timeout_sec': self.stale_game_timeout_sec,
|
|
||||||
'active_games_stale': stale_candidates,
|
|
||||||
}
|
|
||||||
|
|
||||||
def _record_http_request(self, endpoint: str):
|
|
||||||
self.metrics['http_requests_total'] += 1
|
|
||||||
endpoint_counts = self.metrics['http_requests_by_endpoint']
|
|
||||||
endpoint_counts[endpoint] = endpoint_counts.get(endpoint, 0) + 1
|
|
||||||
|
|
||||||
async def _build_metrics(self) -> dict:
|
|
||||||
local_snapshot = self._build_local_metrics()
|
|
||||||
return await self.metrics_manager.snapshot(local_snapshot)
|
|
||||||
|
|
||||||
def _build_prometheus_metrics(self, snapshot: dict) -> str:
|
|
||||||
lines = [
|
|
||||||
'# HELP snake_games_started_total Total games started by snake server.',
|
|
||||||
'# TYPE snake_games_started_total counter',
|
|
||||||
f'snake_games_started_total {snapshot['games_started']}',
|
|
||||||
'# HELP snake_games_ended_total Total games ended by snake server.',
|
|
||||||
'# TYPE snake_games_ended_total counter',
|
|
||||||
f'snake_games_ended_total {snapshot['games_ended']}',
|
|
||||||
'# HELP snake_wins_total Total games won by this snake.',
|
|
||||||
'# TYPE snake_wins_total counter',
|
|
||||||
f'snake_wins_total {snapshot['wins']}',
|
|
||||||
'# HELP snake_losses_total Total games lost by this snake.',
|
|
||||||
'# TYPE snake_losses_total counter',
|
|
||||||
f'snake_losses_total {snapshot['losses']}',
|
|
||||||
'# HELP snake_moves_total Total move decisions served by /move.',
|
|
||||||
'# TYPE snake_moves_total counter',
|
|
||||||
f'snake_moves_total {snapshot['total_moves']}',
|
|
||||||
'# HELP snake_turns_total Total turns across all ended games.',
|
|
||||||
'# TYPE snake_turns_total counter',
|
|
||||||
f'snake_turns_total {snapshot['total_turns']}',
|
|
||||||
'# HELP snake_active_games Currently active games in memory.',
|
|
||||||
'# TYPE snake_active_games gauge',
|
|
||||||
f'snake_active_games {snapshot['active_games']}',
|
|
||||||
'# HELP snake_tracked_games Currently tracked game IDs for move counters.',
|
|
||||||
'# TYPE snake_tracked_games gauge',
|
|
||||||
f'snake_tracked_games {snapshot['tracked_games']}',
|
|
||||||
'# HELP snake_max_turn Highest final turn seen in an ended game.',
|
|
||||||
'# TYPE snake_max_turn gauge',
|
|
||||||
f'snake_max_turn {snapshot['max_turn']}',
|
|
||||||
'# HELP snake_active_games_peak Highest active game count observed.',
|
|
||||||
'# TYPE snake_active_games_peak gauge',
|
|
||||||
f'snake_active_games_peak {snapshot['active_games_peak']}',
|
|
||||||
'# HELP snake_games_autocreated_total Games created on /move or /end due to missing /start.',
|
|
||||||
'# TYPE snake_games_autocreated_total counter',
|
|
||||||
f'snake_games_autocreated_total {snapshot['games_autocreated']}',
|
|
||||||
'# HELP snake_http_requests_total Total HTTP requests handled by this process.',
|
|
||||||
'# TYPE snake_http_requests_total counter',
|
|
||||||
f'snake_http_requests_total {snapshot['http_requests_total']}',
|
|
||||||
'# HELP snake_move_response_ms_total Total move endpoint compute time in milliseconds.',
|
|
||||||
'# TYPE snake_move_response_ms_total counter',
|
|
||||||
f'snake_move_response_ms_total {round(snapshot['move_response_time_ms_total'], 3)}',
|
|
||||||
'# HELP snake_move_response_ms_max Maximum move endpoint compute time in milliseconds.',
|
|
||||||
'# TYPE snake_move_response_ms_max gauge',
|
|
||||||
f'snake_move_response_ms_max {round(snapshot['move_response_time_ms_max'], 3)}',
|
|
||||||
'# HELP snake_avg_turns_per_game Average final turn per ended game.',
|
|
||||||
'# TYPE snake_avg_turns_per_game gauge',
|
|
||||||
f'snake_avg_turns_per_game {snapshot['avg_turns_per_game']}',
|
|
||||||
'# HELP snake_avg_move_response_ms Average move endpoint compute time in milliseconds.',
|
|
||||||
'# TYPE snake_avg_move_response_ms gauge',
|
|
||||||
f'snake_avg_move_response_ms {snapshot['avg_move_response_ms']}',
|
|
||||||
'# HELP snake_win_rate Win ratio from ended games (0.0 - 1.0).',
|
|
||||||
'# TYPE snake_win_rate gauge',
|
|
||||||
f'snake_win_rate {snapshot['win_rate']}',
|
|
||||||
'# HELP snake_last_game_start_unix Unix timestamp of most recent /start request.',
|
|
||||||
'# TYPE snake_last_game_start_unix gauge',
|
|
||||||
f'snake_last_game_start_unix {snapshot['last_game_start_unix']}',
|
|
||||||
'# HELP snake_last_game_end_unix Unix timestamp of most recent /end request.',
|
|
||||||
'# TYPE snake_last_game_end_unix gauge',
|
|
||||||
f'snake_last_game_end_unix {snapshot['last_game_end_unix']}',
|
|
||||||
'# HELP snake_last_move_unix Unix timestamp of most recent /move response.',
|
|
||||||
'# TYPE snake_last_move_unix gauge',
|
|
||||||
f'snake_last_move_unix {snapshot['last_move_unix']}',
|
|
||||||
'# HELP snake_games_stuck_removed_total Active games auto-removed due to inactivity timeout.',
|
|
||||||
'# TYPE snake_games_stuck_removed_total counter',
|
|
||||||
f'snake_games_stuck_removed_total {snapshot['games_stuck_removed']}',
|
|
||||||
'# HELP snake_oldest_active_game_age_sec Age in seconds of the oldest active game.',
|
|
||||||
'# TYPE snake_oldest_active_game_age_sec gauge',
|
|
||||||
f'snake_oldest_active_game_age_sec {snapshot['oldest_active_game_age_sec']}',
|
|
||||||
'# HELP snake_stale_game_timeout_sec Configured inactivity timeout for stale games.',
|
|
||||||
'# TYPE snake_stale_game_timeout_sec gauge',
|
|
||||||
f'snake_stale_game_timeout_sec {snapshot['stale_game_timeout_sec']}',
|
|
||||||
'# HELP snake_active_games_stale Active games currently beyond stale timeout.',
|
|
||||||
'# TYPE snake_active_games_stale gauge',
|
|
||||||
f'snake_active_games_stale {snapshot['active_games_stale']}',
|
|
||||||
]
|
|
||||||
|
|
||||||
lines.extend([
|
|
||||||
'# HELP snake_http_requests_by_endpoint_total Requests served grouped by endpoint.',
|
|
||||||
'# TYPE snake_http_requests_by_endpoint_total counter',
|
|
||||||
])
|
|
||||||
for endpoint, count in snapshot['http_requests_by_endpoint'].items():
|
|
||||||
lines.append(f'snake_http_requests_by_endpoint_total{{endpoint="{endpoint}"}} {count}')
|
|
||||||
|
|
||||||
lines.extend([
|
|
||||||
'# HELP snake_moves_by_direction_total Move responses grouped by direction.',
|
|
||||||
'# TYPE snake_moves_by_direction_total counter',
|
|
||||||
])
|
|
||||||
for direction, count in snapshot['move_direction_counts'].items():
|
|
||||||
lines.append(f'snake_moves_by_direction_total{{direction="{direction}"}} {count}')
|
|
||||||
|
|
||||||
return '\n'.join(lines) + '\n'
|
|
||||||
|
|
||||||
async def _close_metrics_store(self) -> None:
|
|
||||||
await self.metrics_manager.close()
|
|
||||||
|
|||||||
@@ -13,6 +13,9 @@ class MetricsManager:
|
|||||||
key_prefix=key_prefix,
|
key_prefix=key_prefix,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def publish_only(self, snapshot:dict) -> None:
|
||||||
|
await self.store.publish(self.worker_id, snapshot)
|
||||||
|
|
||||||
async def snapshot(self, local_snapshot:dict) -> dict:
|
async def snapshot(self, local_snapshot:dict) -> dict:
|
||||||
await self.store.publish(self.worker_id, local_snapshot)
|
await self.store.publish(self.worker_id, local_snapshot)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,239 @@
|
|||||||
|
import time
|
||||||
|
|
||||||
|
from server.metrics.MetricsManager import MetricsManager
|
||||||
|
|
||||||
|
|
||||||
|
class ServerMetricsCollector:
|
||||||
|
def __init__(self, metrics_manager:MetricsManager, game_state_local_cache:bool, metrics_backend:str, stale_game_timeout_sec:int, game_last_seen_unix:dict, game_move_counts:dict,):
|
||||||
|
self._manager = metrics_manager
|
||||||
|
self._stale_game_timeout_sec = stale_game_timeout_sec
|
||||||
|
self._game_last_seen_unix = game_last_seen_unix
|
||||||
|
self._game_move_counts = game_move_counts
|
||||||
|
self._metrics = {
|
||||||
|
'games_started': 0,
|
||||||
|
'games_ended': 0,
|
||||||
|
'wins': 0,
|
||||||
|
'losses': 0,
|
||||||
|
'total_moves': 0,
|
||||||
|
'total_turns': 0,
|
||||||
|
'max_turn': 0,
|
||||||
|
'active_games_peak': 0,
|
||||||
|
'games_autocreated': 0,
|
||||||
|
'http_requests_total': 0,
|
||||||
|
'http_requests_by_endpoint': {
|
||||||
|
'info': 0,
|
||||||
|
'start': 0,
|
||||||
|
'move': 0,
|
||||||
|
'end': 0,
|
||||||
|
},
|
||||||
|
'move_direction_counts': {
|
||||||
|
'up': 0,
|
||||||
|
'down': 0,
|
||||||
|
'left': 0,
|
||||||
|
'right': 0,
|
||||||
|
'unknown': 0,
|
||||||
|
},
|
||||||
|
'move_response_time_ms_total': 0.0,
|
||||||
|
'move_response_time_ms_max': 0.0,
|
||||||
|
'last_game_start_unix': 0,
|
||||||
|
'last_game_end_unix': 0,
|
||||||
|
'last_move_unix': 0,
|
||||||
|
'games_stuck_removed': 0,
|
||||||
|
'game_state_local_cache_enabled': bool(game_state_local_cache),
|
||||||
|
'metrics_backend': metrics_backend,
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── internal ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def _auto_publish(self) -> None:
|
||||||
|
snapshot = self.build_local_snapshot(self._game_last_seen_unix, self._game_move_counts)
|
||||||
|
await self._manager.publish_only(snapshot)
|
||||||
|
|
||||||
|
# ── record helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def record_http_request(self, endpoint:str) -> None:
|
||||||
|
self._metrics['http_requests_total'] += 1
|
||||||
|
endpoint_counts = self._metrics['http_requests_by_endpoint']
|
||||||
|
endpoint_counts[endpoint] = endpoint_counts.get(endpoint, 0) + 1
|
||||||
|
|
||||||
|
async def record_game_started(self, active_count:int) -> None:
|
||||||
|
self._metrics['games_started'] += 1
|
||||||
|
self._metrics['active_games_peak'] = max(
|
||||||
|
self._metrics['active_games_peak'],
|
||||||
|
active_count,
|
||||||
|
)
|
||||||
|
self._metrics['last_game_start_unix'] = int(time.time())
|
||||||
|
await self._auto_publish()
|
||||||
|
|
||||||
|
async def record_game_autocreated(self) -> None:
|
||||||
|
self._metrics['games_autocreated'] += 1
|
||||||
|
await self._auto_publish()
|
||||||
|
|
||||||
|
async def record_move(self, direction:str, elapsed_ms:float) -> None:
|
||||||
|
self._metrics['total_moves'] += 1
|
||||||
|
self._metrics['move_response_time_ms_total'] += elapsed_ms
|
||||||
|
self._metrics['move_response_time_ms_max'] = max(
|
||||||
|
self._metrics['move_response_time_ms_max'],
|
||||||
|
elapsed_ms,
|
||||||
|
)
|
||||||
|
move_counts = self._metrics['move_direction_counts']
|
||||||
|
if direction in move_counts:
|
||||||
|
move_counts[direction] += 1
|
||||||
|
else:
|
||||||
|
move_counts['unknown'] += 1
|
||||||
|
self._metrics['last_move_unix'] = int(time.time())
|
||||||
|
await self._auto_publish()
|
||||||
|
|
||||||
|
async def record_game_end(self, game_state:dict) -> None:
|
||||||
|
self._metrics['games_ended'] += 1
|
||||||
|
self._metrics['last_game_end_unix'] = int(time.time())
|
||||||
|
|
||||||
|
final_turn = int(game_state.get('turn', 0))
|
||||||
|
self._metrics['total_turns'] += final_turn
|
||||||
|
self._metrics['max_turn'] = max(self._metrics['max_turn'], final_turn)
|
||||||
|
|
||||||
|
you_id = game_state.get('you', {}).get('id')
|
||||||
|
alive_ids = {s.get('id') for s in game_state.get('board', {}).get('snakes', [])}
|
||||||
|
if you_id and you_id in alive_ids:
|
||||||
|
self._metrics['wins'] += 1
|
||||||
|
else:
|
||||||
|
self._metrics['losses'] += 1
|
||||||
|
await self._auto_publish()
|
||||||
|
|
||||||
|
async def record_stuck_removed(self) -> None:
|
||||||
|
self._metrics['games_stuck_removed'] += 1
|
||||||
|
await self._auto_publish()
|
||||||
|
|
||||||
|
# ── snapshot builders ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def build_local_snapshot(self, game_last_seen_unix:dict, game_move_counts:dict) -> dict:
|
||||||
|
games_ended = self._metrics['games_ended']
|
||||||
|
total_moves = self._metrics['total_moves']
|
||||||
|
avg_turns = self._metrics['total_turns'] / games_ended if games_ended else 0.0
|
||||||
|
win_rate = self._metrics['wins'] / games_ended if games_ended else 0.0
|
||||||
|
avg_move_ms = (
|
||||||
|
self._metrics['move_response_time_ms_total'] / total_moves if total_moves else 0.0
|
||||||
|
)
|
||||||
|
|
||||||
|
now = int(time.time())
|
||||||
|
oldest_active_age = (
|
||||||
|
max(0, now - min(game_last_seen_unix.values())) if game_last_seen_unix else 0
|
||||||
|
)
|
||||||
|
stale_candidates = sum(
|
||||||
|
1
|
||||||
|
for last_seen in game_last_seen_unix.values()
|
||||||
|
if now - last_seen >= self._stale_game_timeout_sec
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
**self._metrics,
|
||||||
|
'active_games': len(game_last_seen_unix),
|
||||||
|
'tracked_games': len(game_move_counts),
|
||||||
|
'avg_turns_per_game': round(avg_turns, 2),
|
||||||
|
'win_rate': round(win_rate, 4),
|
||||||
|
'avg_move_response_ms': round(avg_move_ms, 2),
|
||||||
|
'http_requests_by_endpoint': dict(self._metrics['http_requests_by_endpoint']),
|
||||||
|
'move_direction_counts': dict(self._metrics['move_direction_counts']),
|
||||||
|
'oldest_active_game_age_sec': oldest_active_age,
|
||||||
|
'stale_game_timeout_sec': self._stale_game_timeout_sec,
|
||||||
|
'active_games_stale': stale_candidates,
|
||||||
|
}
|
||||||
|
|
||||||
|
async def build_snapshot(self, game_last_seen_unix:dict, game_move_counts:dict) -> dict:
|
||||||
|
local_snapshot = self.build_local_snapshot(game_last_seen_unix, game_move_counts)
|
||||||
|
return await self._manager.snapshot(local_snapshot)
|
||||||
|
|
||||||
|
def build_prometheus_metrics(self, snapshot:dict) -> str:
|
||||||
|
lines = [
|
||||||
|
'# HELP snake_games_started_total Total games started by snake server.',
|
||||||
|
'# TYPE snake_games_started_total counter',
|
||||||
|
f'snake_games_started_total {snapshot["games_started"]}',
|
||||||
|
'# HELP snake_games_ended_total Total games ended by snake server.',
|
||||||
|
'# TYPE snake_games_ended_total counter',
|
||||||
|
f'snake_games_ended_total {snapshot["games_ended"]}',
|
||||||
|
'# HELP snake_wins_total Total games won by this snake.',
|
||||||
|
'# TYPE snake_wins_total counter',
|
||||||
|
f'snake_wins_total {snapshot["wins"]}',
|
||||||
|
'# HELP snake_losses_total Total games lost by this snake.',
|
||||||
|
'# TYPE snake_losses_total counter',
|
||||||
|
f'snake_losses_total {snapshot["losses"]}',
|
||||||
|
'# HELP snake_moves_total Total move decisions served by /move.',
|
||||||
|
'# TYPE snake_moves_total counter',
|
||||||
|
f'snake_moves_total {snapshot["total_moves"]}',
|
||||||
|
'# HELP snake_turns_total Total turns across all ended games.',
|
||||||
|
'# TYPE snake_turns_total counter',
|
||||||
|
f'snake_turns_total {snapshot["total_turns"]}',
|
||||||
|
'# HELP snake_active_games Currently active games in memory.',
|
||||||
|
'# TYPE snake_active_games gauge',
|
||||||
|
f'snake_active_games {snapshot["active_games"]}',
|
||||||
|
'# HELP snake_tracked_games Currently tracked game IDs for move counters.',
|
||||||
|
'# TYPE snake_tracked_games gauge',
|
||||||
|
f'snake_tracked_games {snapshot["tracked_games"]}',
|
||||||
|
'# HELP snake_max_turn Highest final turn seen in an ended game.',
|
||||||
|
'# TYPE snake_max_turn gauge',
|
||||||
|
f'snake_max_turn {snapshot["max_turn"]}',
|
||||||
|
'# HELP snake_active_games_peak Highest active game count observed.',
|
||||||
|
'# TYPE snake_active_games_peak gauge',
|
||||||
|
f'snake_active_games_peak {snapshot["active_games_peak"]}',
|
||||||
|
'# HELP snake_games_autocreated_total Games created on /move or /end due to missing /start.',
|
||||||
|
'# TYPE snake_games_autocreated_total counter',
|
||||||
|
f'snake_games_autocreated_total {snapshot["games_autocreated"]}',
|
||||||
|
'# HELP snake_http_requests_total Total HTTP requests handled by this process.',
|
||||||
|
'# TYPE snake_http_requests_total counter',
|
||||||
|
f'snake_http_requests_total {snapshot["http_requests_total"]}',
|
||||||
|
'# HELP snake_move_response_ms_total Total move endpoint compute time in milliseconds.',
|
||||||
|
'# TYPE snake_move_response_ms_total counter',
|
||||||
|
f'snake_move_response_ms_total {round(snapshot["move_response_time_ms_total"], 3)}',
|
||||||
|
'# HELP snake_move_response_ms_max Maximum move endpoint compute time in milliseconds.',
|
||||||
|
'# TYPE snake_move_response_ms_max gauge',
|
||||||
|
f'snake_move_response_ms_max {round(snapshot["move_response_time_ms_max"], 3)}',
|
||||||
|
'# HELP snake_avg_turns_per_game Average final turn per ended game.',
|
||||||
|
'# TYPE snake_avg_turns_per_game gauge',
|
||||||
|
f'snake_avg_turns_per_game {snapshot["avg_turns_per_game"]}',
|
||||||
|
'# HELP snake_avg_move_response_ms Average move endpoint compute time in milliseconds.',
|
||||||
|
'# TYPE snake_avg_move_response_ms gauge',
|
||||||
|
f'snake_avg_move_response_ms {snapshot["avg_move_response_ms"]}',
|
||||||
|
'# HELP snake_win_rate Win ratio from ended games (0.0 - 1.0).',
|
||||||
|
'# TYPE snake_win_rate gauge',
|
||||||
|
f'snake_win_rate {snapshot["win_rate"]}',
|
||||||
|
'# HELP snake_last_game_start_unix Unix timestamp of most recent /start request.',
|
||||||
|
'# TYPE snake_last_game_start_unix gauge',
|
||||||
|
f'snake_last_game_start_unix {snapshot["last_game_start_unix"]}',
|
||||||
|
'# HELP snake_last_game_end_unix Unix timestamp of most recent /end request.',
|
||||||
|
'# TYPE snake_last_game_end_unix gauge',
|
||||||
|
f'snake_last_game_end_unix {snapshot["last_game_end_unix"]}',
|
||||||
|
'# HELP snake_last_move_unix Unix timestamp of most recent /move response.',
|
||||||
|
'# TYPE snake_last_move_unix gauge',
|
||||||
|
f'snake_last_move_unix {snapshot["last_move_unix"]}',
|
||||||
|
'# HELP snake_games_stuck_removed_total Active games auto-removed due to inactivity timeout.',
|
||||||
|
'# TYPE snake_games_stuck_removed_total counter',
|
||||||
|
f'snake_games_stuck_removed_total {snapshot["games_stuck_removed"]}',
|
||||||
|
'# HELP snake_oldest_active_game_age_sec Age in seconds of the oldest active game.',
|
||||||
|
'# TYPE snake_oldest_active_game_age_sec gauge',
|
||||||
|
f'snake_oldest_active_game_age_sec {snapshot["oldest_active_game_age_sec"]}',
|
||||||
|
'# HELP snake_stale_game_timeout_sec Configured inactivity timeout for stale games.',
|
||||||
|
'# TYPE snake_stale_game_timeout_sec gauge',
|
||||||
|
f'snake_stale_game_timeout_sec {snapshot["stale_game_timeout_sec"]}',
|
||||||
|
'# HELP snake_active_games_stale Active games currently beyond stale timeout.',
|
||||||
|
'# TYPE snake_active_games_stale gauge',
|
||||||
|
f'snake_active_games_stale {snapshot["active_games_stale"]}',
|
||||||
|
]
|
||||||
|
|
||||||
|
lines.extend([
|
||||||
|
'# HELP snake_http_requests_by_endpoint_total Requests served grouped by endpoint.',
|
||||||
|
'# TYPE snake_http_requests_by_endpoint_total counter',
|
||||||
|
])
|
||||||
|
for endpoint, count in snapshot['http_requests_by_endpoint'].items():
|
||||||
|
lines.append(f'snake_http_requests_by_endpoint_total{{endpoint="{endpoint}"}} {count}')
|
||||||
|
|
||||||
|
lines.extend([
|
||||||
|
'# HELP snake_moves_by_direction_total Move responses grouped by direction.',
|
||||||
|
'# TYPE snake_moves_by_direction_total counter',
|
||||||
|
])
|
||||||
|
for direction, count in snapshot['move_direction_counts'].items():
|
||||||
|
lines.append(f'snake_moves_by_direction_total{{direction="{direction}"}} {count}')
|
||||||
|
|
||||||
|
return '\n'.join(lines) + '\n'
|
||||||
|
|
||||||
|
async def close(self) -> None:
|
||||||
|
await self._manager.close()
|
||||||
Reference in New Issue
Block a user