rework dataset function and class structure

This commit is contained in:
2026-04-05 02:21:15 +02:00
parent 066a93f755
commit 332e86e3cc
9 changed files with 318 additions and 248 deletions
+4 -5
View File
@@ -5,6 +5,7 @@ from pathlib import Path
import tempfile, gzip
from server.dataset.RLBootstrapDataset import RLBootstrapDataset
from server.dataset.DatasetIO import DatasetIO
class TestRLBootstrapDataset(unittest.TestCase):
def test_count_jsonl_rows_reads_gzip_dataset(self):
@@ -15,17 +16,15 @@ class TestRLBootstrapDataset(unittest.TestCase):
handle.write("\n")
handle.write('{"turn":2}\n')
self.assertEqual(RLBootstrapDataset.count_jsonl_rows(dataset_path), 2)
self.assertEqual(RLBootstrapDataset.count_jsonl_rows(Path(tmp) / "base.jsonl"), 2)
self.assertEqual(DatasetIO.count_jsonl_rows(dataset_path), 2)
self.assertEqual(DatasetIO.count_jsonl_rows(Path(tmp) / "base.jsonl"), 2)
def test_rotate_and_gzip_if_size_reached_rotates_jsonl(self):
with tempfile.TemporaryDirectory() as tmp:
path = Path(tmp) / "rl_bootstrap.jsonl"
path.write_text("x" * 200, encoding="utf-8")
rotated = RLBootstrapDataset.rotate_and_gzip_if_size_reached(
path, max_bytes=50
)
rotated = DatasetIO.rotate_and_gzip_if_size_reached(path, max_bytes=50)
self.assertTrue(rotated)
self.assertFalse(path.exists())