"""Aggregate statistics over Battlesnake JSONL move datasets.

Each non-blank input line is parsed as one JSON object ("row").  Rows are
grouped by their ``game_id`` and summarised per game, per day and overall.
"""

import argparse
import glob
import json
import re
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Optional


class DatasetStats:
    """Collect row, game and per-day statistics from JSONL dataset files.

    ``input_files`` is a list of strings, each naming a file, a directory
    (searched recursively for ``*.jsonl``) or a glob pattern.
    """

    # First ``YYYY-MM-DD``-shaped substring of a file name is taken as its day.
    DAY_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})")

    # Characters whose presence makes a non-existing input a glob pattern.
    _GLOB_CHARS = "*?[]"

    def __init__(self, input_files: list[str]):
        self.input_files = input_files

    def _resolve_input_files(self) -> list[Path]:
        """Expand ``self.input_files`` into a de-duplicated list of files.

        Inputs are processed in order; directories contribute their
        ``*.jsonl`` files (recursively, sorted), existing files contribute
        themselves, and anything else containing glob metacharacters is
        expanded with :func:`glob.glob`.  Inputs that match nothing are
        ignored.  A file reachable through several inputs is returned once,
        at the position of its first occurrence.
        """
        resolved: list[Path] = []
        seen: set[str] = set()

        def add(candidate: Path) -> None:
            # De-duplicate on the resolved path so the same file reached via
            # different spellings (relative, symlink, glob) is listed once.
            key = str(candidate.resolve())
            if key not in seen:
                seen.add(key)
                resolved.append(candidate)

        for item in self.input_files:
            path = Path(item)
            if path.is_dir():
                for file_path in sorted(path.rglob("*.jsonl")):
                    # A directory may itself be named "*.jsonl"; skip those.
                    if file_path.is_file():
                        add(file_path)
            elif path.is_file():
                # Checked before the glob test so that a real file whose name
                # contains "[", "]", "*" or "?" is not mistaken for a pattern.
                add(path)
            elif any(ch in item for ch in self._GLOB_CHARS):
                # recursive=True lets "**" span directories, matching the
                # recursive treatment of directory inputs above.
                for match in sorted(glob.glob(item, recursive=True)):
                    file_path = Path(match)
                    if file_path.is_file():
                        add(file_path)
        return resolved

    def _infer_day(self, file_path: Path) -> str:
        """Return the ``YYYY-MM-DD`` day associated with ``file_path``.

        Uses the first date-shaped substring of the file name; when there is
        none, falls back to the file's modification time (local time).
        """
        match = self.DAY_PATTERN.search(file_path.name)
        if match:
            return match.group(1)
        modified = datetime.fromtimestamp(file_path.stat().st_mtime)
        return modified.strftime("%Y-%m-%d")

    def _game_score(self, game: dict) -> float:
        """Return the overall score of a game summary, rounded to 3 places.

        ``max_turn * 2 + rows`` plus a bonus of ``4 - avg_safe_options``
        (never negative; zero when the average is unknown).
        """
        avg_safe = game["avg_safe_options"]
        pressure_bonus = 0 if avg_safe is None else max(0.0, 4.0 - avg_safe)
        return round(game["max_turn"] * 2.0 + game["rows"] + pressure_bonus, 3)

    def _pressure_score(self, game: dict) -> float:
        """Return the pressure score of a game summary, rounded to 3 places.

        Combines ``max_turn``, the fraction of rows counted as pressure turns
        and how far the average safe-option count lies below 3.
        """
        rows = max(1, game["rows"])  # guard the division below
        avg_safe = game["avg_safe_options"]
        pressure_ratio = game["pressure_turns"] / rows
        safe_tightness = 0.0 if avg_safe is None else max(0.0, 3.0 - avg_safe)
        return round(
            game["max_turn"] * 1.2 + pressure_ratio * 120.0 + safe_tightness * 20.0,
            3,
        )

    def _extract_safe_options(self, row: dict) -> Optional[int]:
        """Return the number of safe options recorded for ``row``.

        An integer top-level ``safe_options`` wins.  Otherwise the first
        ``history["data"]`` entry whose ``function`` is
        ``"get_possible_moves"`` and whose ``safe_positions`` is a dict (a
        missing key counts as an empty dict) supplies its length.  Returns
        ``None`` when neither source is available, including when
        ``history`` or its ``data`` is null or of an unexpected type.
        """
        top_level = row.get("safe_options")
        if isinstance(top_level, int):
            return top_level

        history = row.get("history")
        if not isinstance(history, dict):
            return None
        entries = history.get("data")
        if not isinstance(entries, list):
            return None

        for item in entries:
            if not isinstance(item, dict):
                continue
            if item.get("function") != "get_possible_moves":
                continue
            safe_positions = item.get("safe_positions", {})
            if isinstance(safe_positions, dict):
                return len(safe_positions)
        return None

    @staticmethod
    def _ranked(summaries: list, score_key: str) -> list:
        """Return ``summaries`` sorted best-first by ``score_key``.

        Ties are broken by ``max_turn`` and then ``rows``; the sort is
        stable, so remaining ties keep their input order.
        """
        return sorted(
            summaries,
            key=lambda g: (g[score_key], g["max_turn"], g["rows"]),
            reverse=True,
        )

    def _summarize_game(self, game: dict) -> dict:
        """Turn an accumulated per-game record into its report summary."""
        avg_safe = None
        if game["safe_options_count"] > 0:
            avg_safe = round(
                game["safe_options_sum"] / game["safe_options_count"], 3
            )
        summary = {
            "game_id": game["game_id"],
            "day": game["day"],
            "snake_type": game["snake_type"],
            "game_type": game["game_type"],
            "rows": game["rows"],
            "max_turn": game["max_turn"],
            "avg_safe_options": avg_safe,
            "pressure_turns": game["pressure_turns"],
        }
        summary["score"] = self._game_score(summary)
        summary["pressure_score"] = self._pressure_score(summary)
        return summary

    def analyze(self) -> dict:
        """Read every resolved input file and build the statistics report.

        Blank lines, lines whose JSON value is not an object and rows without
        a truthy ``game_id`` are skipped.  A null or missing ``turn`` counts
        as turn 0.  A game's ``day``, ``snake_type`` and ``game_type`` are
        taken from the first row seen for that ``game_id``.

        Returns:
            A JSON-serialisable dict with the keys ``files_scanned``,
            ``overall``, ``by_day``, ``top_games`` and ``top_pressure_games``.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        files = self._resolve_input_files()
        totals = {
            "rows": 0,
            "games": set(),
            "snake_types": Counter(),
            "game_types": Counter(),
            "moves": Counter(),
            "days": Counter(),
        }
        games: dict = {}
        day_games = defaultdict(set)

        for file_path in files:
            day = self._infer_day(file_path)
            with file_path.open("r", encoding="utf-8") as source:
                for line in source:
                    if not line.strip():
                        continue
                    row = json.loads(line)
                    if not isinstance(row, dict):
                        continue
                    game_id = row.get("game_id")
                    if not game_id:
                        continue

                    # ``or 0`` also covers an explicit JSON null turn.
                    turn = int(row.get("turn") or 0)
                    safe_options = self._extract_safe_options(row)
                    snake_type = row.get("snake_type", "unknown")
                    move = row.get("move", "unknown")
                    game_type = row.get("game_type", {})
                    if isinstance(game_type, dict):
                        game_type_name = game_type.get("name", "unknown")
                    else:
                        game_type_name = str(game_type)

                    totals["rows"] += 1
                    totals["games"].add(game_id)
                    totals["snake_types"][snake_type] += 1
                    totals["game_types"][game_type_name] += 1
                    totals["moves"][move] += 1
                    totals["days"][day] += 1

                    if game_id not in games:
                        games[game_id] = {
                            "game_id": game_id,
                            "day": day,
                            "snake_type": snake_type,
                            "game_type": game_type_name,
                            "rows": 0,
                            "max_turn": -1,
                            "safe_options_sum": 0,
                            "safe_options_count": 0,
                            "pressure_turns": 0,
                        }
                    game = games[game_id]
                    game["rows"] += 1
                    game["max_turn"] = max(game["max_turn"], turn)
                    if isinstance(safe_options, int):
                        game["safe_options_sum"] += safe_options
                        game["safe_options_count"] += 1
                        # Two or fewer safe options counts as a pressure turn.
                        if safe_options <= 2:
                            game["pressure_turns"] += 1
                    day_games[day].add(game_id)

        game_summaries = self._ranked(
            [self._summarize_game(game) for game in games.values()], "score"
        )
        pressure_sorted = self._ranked(game_summaries, "pressure_score")

        by_day = {}
        for day, game_ids in sorted(day_games.items()):
            # Filtering the score-ranked list keeps it score-ranked, so no
            # further sort is needed for the per-day "best_game".
            day_list = [g for g in game_summaries if g["game_id"] in game_ids]
            day_pressure = self._ranked(day_list, "pressure_score")
            by_day[day] = {
                "rows": totals["days"][day],
                "games": len(game_ids),
                "best_game": day_list[0] if day_list else None,
                "best_pressure_game": day_pressure[0] if day_pressure else None,
            }

        return {
            "files_scanned": [str(path) for path in files],
            "overall": {
                "rows": totals["rows"],
                "games": len(totals["games"]),
                "snake_types": dict(totals["snake_types"]),
                "game_types": dict(totals["game_types"]),
                "moves": dict(totals["moves"]),
                "best_game": game_summaries[0] if game_summaries else None,
                "best_pressure_game": (
                    pressure_sorted[0] if pressure_sorted else None
                ),
            },
            "by_day": by_day,
            "top_games": game_summaries[:10],
            "top_pressure_games": pressure_sorted[:10],
        }


def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point: print the report and optionally save it.

    ``argv`` defaults to ``sys.argv[1:]`` (argparse's behaviour for ``None``).
    """
    parser = argparse.ArgumentParser(description="Analyze Battlesnake JSONL datasets")
    parser.add_argument(
        "--input",
        action="append",
        required=True,
        help=(
            "Input JSONL file, directory, or glob pattern. "
            "Repeat for multiple inputs."
        ),
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Optional path to write JSON report",
    )
    args = parser.parse_args(argv)

    report = DatasetStats(args.input).analyze()
    # Serialise once and reuse the text for both stdout and the output file.
    serialized = json.dumps(report, indent=2)
    print(serialized)

    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(serialized, encoding="utf-8")


if __name__ == "__main__":
    main()