From 37de34cc5eb373f9222f0310c853a599e28907bd Mon Sep 17 00:00:00 2001 From: Daniel Dolezal Date: Fri, 3 Apr 2026 15:43:19 +0200 Subject: [PATCH] update to store and curate data correctly --- .gitignore | 1 + README.md | 43 ++++++++++++ asgi.py | 1 - justfile | 3 + main.py | 2 - server/DatasetCurator.py | 144 +++++++++++++++++++++++++++++++++++++++ server/Server.py | 12 ++-- 7 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 server/DatasetCurator.py diff --git a/.gitignore b/.gitignore index c4cf429..a737dd1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ __pycache__/ data/ .env dbschema/migrations/ +*.jsonl diff --git a/README.md b/README.md index f9fac20..c1486f5 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,49 @@ Or with `just`: just export-dataset ``` +Curate a high-quality training subset (single file): + +```sh +python -m server.DatasetCurator --input good_moves-2026-04-03.jsonl --output data/dataset/best_moves.jsonl +``` + +Curate from multiple JSONL sources (repeat `--input`): + +```sh +python -m server.DatasetCurator \ + --input good_moves-2026-04-03.jsonl \ + --input good_moves-2026-04-04.jsonl \ + --output data/dataset/best_moves.jsonl +``` + +Curate from folder or glob: + +```sh +python -m server.DatasetCurator --input data/dataset --output data/dataset/best_moves.jsonl +python -m server.DatasetCurator --input "good_moves-*.jsonl" --output data/dataset/best_moves.jsonl +``` + +Append mode (keeps existing curated rows and deduplicates against them): + +```sh +python -m server.DatasetCurator --input "good_moves-*.jsonl" --output data/dataset/best_moves.jsonl --append +``` + +Archive processed input files after curation: + +```sh +python -m server.DatasetCurator --input "good_moves-*.jsonl" --output data/dataset/best_moves.jsonl --append --archive-input +python -m server.DatasetCurator --input "good_moves-*.jsonl" --output data/dataset/best_moves.jsonl --append --archive-input --archive-dir data/dataset/archive +``` + +Or with `just`: + +```sh +just curate-dataset +just curate-dataset append=true +just curate-dataset append=true archive=true archive_dir=data/dataset/archive +``` + To store compact dataset-only records (JSONL) and skip full per-game JSON files: ```sh diff --git a/asgi.py b/asgi.py index f80a71c..642e79f 100644 --- a/asgi.py +++ b/asgi.py @@ -5,7 +5,6 @@ server = Server( data_path=os.path.dirname(__file__), snake_type=os.environ.get("SNAKE", "BestBattleSnake"), storage_type=os.environ.get("STORAGE", "LocalStorage"), - store_game_when_win_and_moves_are_bigger_as=int(os.environ.get("STORE_IF_WIN_AND_MOVES_ARE_BIGGER_AS", 10)), debug=os.environ.get("DEBUG_SERVER", False), check_tls_security=False, ) diff --git a/justfile b/justfile index 8b587cc..290700b 100644 --- a/justfile +++ b/justfile @@ -52,3 +52,6 @@ test-seed: export-dataset input="data" output="data/dataset/good_moves.jsonl": python -m server.DatasetExporter --input "{{input}}" --output "{{output}}" + +curate-dataset input="good_moves-*.jsonl" output="data/dataset/best_moves.jsonl" min_turn="6" late_turn="20" max_safe_options="2" min_score="3" append="false" archive="false" archive_dir="": + FLAGS=""; if [ "{{append}}" = "true" ]; then FLAGS="$FLAGS --append"; fi; if [ "{{archive}}" = "true" ]; then FLAGS="$FLAGS --archive-input"; fi; if [ -n "{{archive_dir}}" ]; then FLAGS="$FLAGS --archive-dir {{archive_dir}}"; fi; python -m server.DatasetCurator --input "{{input}}" --output "{{output}}" --min-turn "{{min_turn}}" --late-turn "{{late_turn}}" --max-safe-options "{{max_safe_options}}" --min-score "{{min_score}}" $FLAGS diff --git a/main.py b/main.py index b5814f3..1f269ef 100755 --- a/main.py +++ b/main.py @@ -24,14 +24,12 @@ if __name__ == "__main__": "STORE_GAME_HISTORY": True, "DEBUG": True, "SNAKE": "TemplateSnake", - "STORE_IF_WIN_AND_MOVES_ARE_BIGGER_AS": 10, }) server = Server( data_path=os.path.dirname(__file__), snake_type=os.environ.get("SNAKE", "TemplateSnake"), storage_type=os.environ.get("STORAGE", "LocalStorage"), - store_game_when_win_and_moves_are_bigger_as=int(os.environ.get("STORE_IF_WIN_AND_MOVES_ARE_BIGGER_AS", 10)), debug=os.environ.get("DEBUG_SERVER", False), check_tls_security=False, ) diff --git a/server/DatasetCurator.py b/server/DatasetCurator.py new file mode 100644 index 0000000..90b79fb --- /dev/null +++ b/server/DatasetCurator.py @@ -0,0 +1,144 @@ +import argparse +import hashlib +import json +from pathlib import Path + +class DatasetCurator: + def __init__(self, input_file: str, output_file: str, min_turn: int = 6, late_turn: int = 20, max_safe_options: int = 2, min_score: int = 3,): + self.input_file = Path(input_file) + self.output_file = Path(output_file) + self.min_turn = min_turn + self.late_turn = late_turn + self.max_safe_options = max_safe_options + self.min_score = min_score + + def _safe_options_count(self, row: dict): + history = row.get("history", {}) + for item in history.get("data", []): + if item.get("function") == "get_possible_moves": + return len(item.get("safe_positions", {})) + return None + + def _state_hash(self, row: dict): + board = row.get("game_board", {}) + snakes = board.get("snakes", []) + + snakes_key = [] + for snake in snakes: + snakes_key.append( + ( + snake.get("id"), + snake.get("health"), + tuple( + (seg.get("x"), seg.get("y")) for seg in snake.get("body", []) + ), + ) + ) + + key = { + "width": board.get("width"), + "height": board.get("height"), + "snakes": sorted(snakes_key), + "food": sorted((f.get("x"), f.get("y")) for f in board.get("food", [])), + "hazards": sorted( + (h.get("x"), h.get("y")) for h in board.get("hazards", []) + ), + } + raw = json.dumps(key, sort_keys=True, separators=(",", ":")) + return hashlib.sha1(raw.encode("utf-8")).hexdigest() + + def _score(self, row: dict): + score = 0 + turn = int(row.get("turn", 0)) + safe_options = self._safe_options_count(row) + snakes = row.get("game_board", {}).get("snakes", []) + opponents = max(0, len(snakes) - 1) + + if turn >= self.late_turn: + score += 2 + if safe_options is not None and safe_options <= self.max_safe_options: + score += 3 + if opponents >= 1: + score += 1 + + return score, safe_options + + def curate(self): + self.output_file.parent.mkdir(parents=True, exist_ok=True) + + total = 0 + kept = 0 + skipped_turn = 0 + skipped_quality = 0 + skipped_duplicate = 0 + seen_states = set() + + with self.input_file.open("r", encoding="utf-8") as src: + with self.output_file.open("w", encoding="utf-8") as dst: + for line in src: + if not line.strip(): + continue + + total += 1 + row = json.loads(line) + + if not row.get("is_good_move", False): + skipped_quality += 1 + continue + + if int(row.get("turn", 0)) < self.min_turn: + skipped_turn += 1 + continue + + quality_score, safe_options = self._score(row) + if quality_score < self.min_score: + skipped_quality += 1 + continue + + state_key = self._state_hash(row) + dedupe_key = (state_key, row.get("move")) + if dedupe_key in seen_states: + skipped_duplicate += 1 + continue + seen_states.add(dedupe_key) + + compact_row = { + "game_id": row.get("game_id"), + "turn": row.get("turn"), + "move": row.get("move"), + "game_type": row.get("game_type"), + "quality_score": quality_score, + "safe_options": safe_options, + "game_board": row.get("game_board"), + } + dst.write(json.dumps(compact_row, ensure_ascii=False) + "\n") + kept += 1 + + return { + "total_rows": total, + "kept_rows": kept, + "skipped_turn": skipped_turn, + "skipped_quality": skipped_quality, + "skipped_duplicate": skipped_duplicate, + "output_file": str(self.output_file), + } + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Create curated best-moves dataset") + parser.add_argument("--input", required=True, help="Input JSONL file") + parser.add_argument("--output", required=True, help="Output JSONL file") + parser.add_argument("--min-turn", type=int, default=6) + parser.add_argument("--late-turn", type=int, default=20) + parser.add_argument("--max-safe-options", type=int, default=2) + parser.add_argument("--min-score", type=int, default=3) + args = parser.parse_args() + + report = DatasetCurator( + input_file=args.input, + output_file=args.output, + min_turn=args.min_turn, + late_turn=args.late_turn, + max_safe_options=args.max_safe_options, + min_score=args.min_score, + ).curate() + print(json.dumps(report, indent=2)) diff --git a/server/Server.py b/server/Server.py index a58ca36..1b2ec25 100644 --- a/server/Server.py +++ b/server/Server.py @@ -8,9 +8,15 @@ from quart import Quart, request, jsonify import logging, json, os, re class Server: - default_snake_config = {"apiversion":"1","author":"","color":"#888888","head":"default","tail":"default"} + default_snake_config = { + "apiversion": "1", + "author": "", + "color": "#888888", + "head": "default", + "tail": "default", + } - def __init__(self, data_path:str, snake_type:str, storage_type:str, debug:bool=False, store_game_when_win_and_moves_are_bigger_as:int=10, check_tls_security:bool=False): + def __init__(self, data_path:str, snake_type:str, storage_type:str, debug:bool=False, check_tls_security:bool=False): self.debug = debug self.snake_type = snake_type self.storage_type = storage_type @@ -20,7 +26,6 @@ class Server: self.check_tls_security = check_tls_security self.store_game_state = False - self.store_game_when_win_and_moves_are_bigger_as = store_game_when_win_and_moves_are_bigger_as self.running_games:dict[str, GameBoard] = {} @@ -62,7 +67,6 @@ class Server: game_state = await request.get_json() if self.store_game_state: game_board = await self._get_game_board(game_state, end=True) - #if not game_board.get_winner() == "me" and not game_board.get_turn() <= self.store_game_when_win_and_moves_are_bigger_as: if self.check_tls_security: await game_board.save( StorageLoader.build(self.storage_type),