"""Analyze Battlesnake JSONL datasets overall and per day.

Usage::

    python -m server.DatasetStats --input "good_moves-*.jsonl"
    python -m server.DatasetStats --input data/dataset --output data/dataset/stats-report.json

or through the ``just`` recipe::

    just analyze-dataset
    just analyze-dataset input=data/dataset output=data/dataset/stats-report.json

The report contains both ``best_game`` (survival/length focused) and
``best_pressure_game`` (high-pressure quality focused: fewer safe options
combined with strong survival).
"""

import argparse
import glob
import json
import re
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Optional


class DatasetStats:
    """Aggregate per-game and per-day statistics from JSONL move records.

    Each non-blank input line is expected to be a JSON object that carries at
    least a ``game_id``; ``turn``, ``move``, ``snake_type``, ``game_type``,
    ``safe_options`` and ``history`` are read when present.
    """

    # ISO date (YYYY-MM-DD) embedded in a dataset file name.
    DAY_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})")

    # A row with at most this many safe moves counts as a "pressure" turn.
    PRESSURE_MAX_SAFE_OPTIONS = 2

    def __init__(self, input_files: list[str]):
        """Store the raw inputs: file paths, directories, or glob patterns."""
        self.input_files = input_files

    def _resolve_input_files(self) -> list[Path]:
        """Expand ``self.input_files`` into a de-duplicated list of files.

        Directories are searched recursively for ``*.jsonl``; entries that
        contain glob metacharacters are expanded with ``glob`` (``**`` matches
        recursively); anything else is kept only if it is an existing file.
        Inputs that match nothing are ignored.  Order follows the inputs, and
        a file reached through several inputs is listed once.
        """
        resolved: list[Path] = []
        seen: set[str] = set()

        def add(candidate: Path) -> None:
            # Key on the resolved path so "./a.jsonl" and "a.jsonl" collapse.
            if not candidate.is_file():
                return
            key = str(candidate.resolve())
            if key not in seen:
                seen.add(key)
                resolved.append(candidate)

        for item in self.input_files:
            path = Path(item)
            if path.is_dir():
                for file_path in sorted(path.rglob("*.jsonl")):
                    add(file_path)
            elif any(ch in item for ch in "*?[]"):
                for match in sorted(glob.glob(item, recursive=True)):
                    add(Path(match))
            else:
                add(path)

        return resolved

    def _infer_day(self, file_path: Path) -> str:
        """Return the ``YYYY-MM-DD`` day a file belongs to.

        The date embedded in the file name wins; otherwise the file's
        modification time (local time) is used.
        """
        match = self.DAY_PATTERN.search(file_path.name)
        if match:
            return match.group(1)
        modified = datetime.fromtimestamp(file_path.stat().st_mtime)
        return modified.strftime("%Y-%m-%d")

    def _game_score(self, game: dict) -> float:
        """Score a game summary by survival: turns, rows, small pressure bonus."""
        avg_safe = game["avg_safe_options"]
        pressure_bonus = 0 if avg_safe is None else max(0.0, 4.0 - avg_safe)
        return round(game["max_turn"] * 2.0 + game["rows"] + pressure_bonus, 3)

    def _pressure_score(self, game: dict) -> float:
        """Score a game summary by how much of it was played under pressure."""
        rows = max(1, game["rows"])  # guard the division below
        avg_safe = game["avg_safe_options"]

        pressure_ratio = game["pressure_turns"] / rows
        safe_tightness = 0.0 if avg_safe is None else max(0.0, 3.0 - avg_safe)
        return round(
            game["max_turn"] * 1.2 + pressure_ratio * 120.0 + safe_tightness * 20.0,
            3,
        )

    def _extract_safe_options(self, row: dict) -> Optional[int]:
        """Return the number of safe moves recorded for ``row``, if any.

        A top-level integer ``safe_options`` is preferred.  Otherwise the
        first ``get_possible_moves`` entry in ``row["history"]["data"]`` whose
        ``safe_positions`` is a dict supplies the count.  Returns ``None``
        when neither source is usable.
        """
        top_level = row.get("safe_options")
        # bool is a subclass of int, but true/false is not a move count.
        if isinstance(top_level, int) and not isinstance(top_level, bool):
            return top_level

        history = row.get("history")
        if not isinstance(history, dict):
            return None
        entries = history.get("data")
        if not isinstance(entries, list):
            return None

        for item in entries:
            if not isinstance(item, dict):
                continue
            if item.get("function") != "get_possible_moves":
                continue
            safe_positions = item.get("safe_positions", {})
            if isinstance(safe_positions, dict):
                return len(safe_positions)
        return None

    @staticmethod
    def _parse_row(line: str) -> Optional[dict]:
        """Decode one JSONL line; return ``None`` unless it is a JSON object."""
        try:
            row = json.loads(line)
        except json.JSONDecodeError:
            return None
        return row if isinstance(row, dict) else None

    def analyze(self) -> dict:
        """Scan every input file and build the statistics report.

        Returns a dict with ``files_scanned``, ``overall``, ``by_day``,
        ``top_games`` and ``top_pressure_games``.  Rows that cannot be used
        (malformed JSON, non-object values, missing ``game_id``, non-integer
        ``turn``) are skipped and counted in ``overall["skipped_rows"]`` so a
        single bad line does not abort the whole analysis.

        A game's ``day`` is the day of the first file it was seen in, but the
        game is listed under every day in which one of its rows appears.
        """
        files = self._resolve_input_files()

        rows_total = 0
        skipped_rows = 0
        snake_types: Counter = Counter()
        game_types: Counter = Counter()
        moves: Counter = Counter()
        day_rows: Counter = Counter()

        games: dict = {}
        day_games = defaultdict(set)

        for file_path in files:
            day = self._infer_day(file_path)
            with file_path.open("r", encoding="utf-8") as source:
                for line in source:
                    if not line.strip():
                        continue

                    row = self._parse_row(line)
                    game_id = row.get("game_id") if row is not None else None
                    if not game_id:
                        skipped_rows += 1
                        continue

                    try:
                        turn = int(row.get("turn", 0))
                    except (TypeError, ValueError, OverflowError):
                        skipped_rows += 1
                        continue

                    safe_options = self._extract_safe_options(row)
                    snake_type = row.get("snake_type", "unknown")
                    move = row.get("move", "unknown")

                    game_type = row.get("game_type", {})
                    if isinstance(game_type, dict):
                        game_type_name = game_type.get("name", "unknown")
                    else:
                        game_type_name = str(game_type)

                    rows_total += 1
                    snake_types[snake_type] += 1
                    game_types[game_type_name] += 1
                    moves[move] += 1
                    day_rows[day] += 1

                    game = games.get(game_id)
                    if game is None:
                        # Descriptive fields come from the first row seen.
                        game = games[game_id] = {
                            "game_id": game_id,
                            "day": day,
                            "snake_type": snake_type,
                            "game_type": game_type_name,
                            "rows": 0,
                            "max_turn": -1,
                            "safe_options_sum": 0,
                            "safe_options_count": 0,
                            "pressure_turns": 0,
                        }

                    game["rows"] += 1
                    game["max_turn"] = max(game["max_turn"], turn)
                    if safe_options is not None:
                        game["safe_options_sum"] += safe_options
                        game["safe_options_count"] += 1
                        if safe_options <= self.PRESSURE_MAX_SAFE_OPTIONS:
                            game["pressure_turns"] += 1

                    day_games[day].add(game_id)

        def score_key(item: dict) -> tuple:
            return (item["score"], item["max_turn"], item["rows"])

        def pressure_key(item: dict) -> tuple:
            return (item["pressure_score"], item["max_turn"], item["rows"])

        game_summaries = []
        for game in games.values():
            avg_safe = None
            if game["safe_options_count"] > 0:
                avg_safe = round(
                    game["safe_options_sum"] / game["safe_options_count"], 3
                )
            item = {
                "game_id": game["game_id"],
                "day": game["day"],
                "snake_type": game["snake_type"],
                "game_type": game["game_type"],
                "rows": game["rows"],
                "max_turn": game["max_turn"],
                "avg_safe_options": avg_safe,
                "pressure_turns": game["pressure_turns"],
            }
            item["score"] = self._game_score(item)
            item["pressure_score"] = self._pressure_score(item)
            game_summaries.append(item)

        game_summaries.sort(key=score_key, reverse=True)
        pressure_sorted = sorted(game_summaries, key=pressure_key, reverse=True)

        by_day = {}
        for day, game_ids in sorted(day_games.items()):
            # Filtering the score-sorted list keeps it score-sorted, so no
            # second sort is needed to find the day's best game.
            day_list = [g for g in game_summaries if g["game_id"] in game_ids]
            day_pressure = sorted(day_list, key=pressure_key, reverse=True)

            by_day[day] = {
                "rows": day_rows[day],
                "games": len(game_ids),
                "best_game": day_list[0] if day_list else None,
                "best_pressure_game": day_pressure[0] if day_pressure else None,
            }

        return {
            "files_scanned": [str(path) for path in files],
            "overall": {
                "rows": rows_total,
                "games": len(games),
                "skipped_rows": skipped_rows,
                "snake_types": dict(snake_types),
                "game_types": dict(game_types),
                "moves": dict(moves),
                "best_game": game_summaries[0] if game_summaries else None,
                "best_pressure_game": pressure_sorted[0] if pressure_sorted else None,
            },
            "by_day": by_day,
            "top_games": game_summaries[:10],
            "top_pressure_games": pressure_sorted[:10],
        }


def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point: print the report and optionally save it."""
    parser = argparse.ArgumentParser(description="Analyze Battlesnake JSONL datasets")
    parser.add_argument(
        "--input",
        action="append",
        required=True,
        help="Input JSONL file, directory, or glob pattern. Repeat for multiple inputs.",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Optional path to write JSON report",
    )
    args = parser.parse_args(argv)

    report = DatasetStats(args.input).analyze()
    rendered = json.dumps(report, indent=2)  # serialize once, reuse for both sinks
    print(rendered)

    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(rendered, encoding="utf-8")


if __name__ == "__main__":
    main()