From c0e98f93b4e2a5cb0820c387cc93767618370fd5 Mon Sep 17 00:00:00 2001
From: Oliver Stanley <olivergestanley@gmail.com>
Date: Mon, 2 Jun 2025 07:57:15 +0100
Subject: [PATCH] make task entries json serializable (#443)

* make sympy-based task entries json serializable

* remove datetime objects from time_intervals metadata

* make advanced_geometry metadata json serializable

* make futoshiki metadata json serializable

* fixes

* futoshiki tweaks

* fix advanced_geometry

* handle fractions in the string representations of orthocenter coordinates when scoring

* fix

* restore start_time, end_time as str
---
 .../algebra/intermediate_integration.py       |  1 -
 .../algebra/polynomial_multiplication.py      |  2 +-
 reasoning_gym/algebra/simple_integration.py   |  1 -
 reasoning_gym/arithmetic/time_intervals.py    |  4 +--
 reasoning_gym/games/futoshiki.py              |  4 ++-
 reasoning_gym/games/puzzle24.py               |  1 -
 reasoning_gym/geometry/advanced_geometry.py   | 31 +++++++++++--------
 tests/test_futoshiki.py                       | 10 ++++--
 tests/test_puzzle24.py                        |  1 -
 tests/test_simple_integration.py              |  1 -
 tests/test_time_intervals.py                  | 17 +++-------
 11 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/reasoning_gym/algebra/intermediate_integration.py b/reasoning_gym/algebra/intermediate_integration.py
index 78daad06..6449bd55 100644
--- a/reasoning_gym/algebra/intermediate_integration.py
+++ b/reasoning_gym/algebra/intermediate_integration.py
@@ -242,7 +242,6 @@ Use same variable symbols as given in the question
                 "integrand": str(integrand),
                 "problem_type": problem_type,
                 "variable": str(x),
-                "expected_answer_expression": answer,
                 "difficulty": {
                     "problem_type_weights": self.config.problem_type_weights,
                 },
diff --git a/reasoning_gym/algebra/polynomial_multiplication.py b/reasoning_gym/algebra/polynomial_multiplication.py
index 4a388532..f0541ac3 100644
--- a/reasoning_gym/algebra/polynomial_multiplication.py
+++ b/reasoning_gym/algebra/polynomial_multiplication.py
@@ -114,7 +114,7 @@ When performing calculations, please follow these guidelines:
                 "source_dataset": DATASET_NAME,
                 "source_index": idx,
                 "polynomial_expr": str(polynomial_expr),
-                "variables": list(product.free_symbols),
+                "variables": [str(x) for x in product.free_symbols],
                 "difficulty": {
                     "min_terms": self.config.min_terms,
                     "max_terms": self.config.max_terms,
diff --git a/reasoning_gym/algebra/simple_integration.py b/reasoning_gym/algebra/simple_integration.py
index 78823ca8..d9fdfb2c 100644
--- a/reasoning_gym/algebra/simple_integration.py
+++ b/reasoning_gym/algebra/simple_integration.py
@@ -88,7 +88,6 @@ When performing calculations, please follow these guidelines:
                 "source_index": idx,
                 "integrand": str(derivative),
                 "variable": str(symbol),
-                "expected_answer_expression": polynomial,
                 "num_terms": num_terms,
                 "difficulty": {
                     "terms": (self.config.min_terms, self.config.max_terms),
diff --git a/reasoning_gym/arithmetic/time_intervals.py b/reasoning_gym/arithmetic/time_intervals.py
index 249ff5aa..1cf7c1b0 100644
--- a/reasoning_gym/arithmetic/time_intervals.py
+++ b/reasoning_gym/arithmetic/time_intervals.py
@@ -139,8 +139,8 @@ class TimeIntervalsDataset(ProceduralDataset):
                 "source_dataset": DATASET_NAME,
                 "source_index": idx,
                 "task_type": task_type,
-                "start_time": start_dt,
-                "end_time": end_dt,
+                "start_time": str(start_dt),
+                "end_time": str(end_dt),
                 "format": format_str,
                 "expected_format": expected_format,
                 "difficulty": {
diff --git a/reasoning_gym/games/futoshiki.py b/reasoning_gym/games/futoshiki.py
index 5e377fdb..4246e8b9 100644
--- a/reasoning_gym/games/futoshiki.py
+++ b/reasoning_gym/games/futoshiki.py
@@ -79,6 +79,8 @@ class FutoshikiDataset(ProceduralDataset):
             f"Remember, in Futoshiki each row and column must contain each number from 1 to {board_size} exactly once."
         )
 
+        constraints_meta = [(r1, c1, r2, c2, sign) for ((r1, c1), (r2, c2)), sign in constraints.items()]
+
         return {
             "question": question,
             "answer": solution_str,
@@ -86,7 +88,7 @@ class FutoshikiDataset(ProceduralDataset):
                 "source_dataset": DATASET_NAME,
                 "source_index": idx,
                 "puzzle": puzzle,
-                "constraints": constraints,
+                "constraints": constraints_meta,
                 "solution": solution,
                 "board_size": board_size,
                 "difficulty_rating": difficulty,
diff --git a/reasoning_gym/games/puzzle24.py b/reasoning_gym/games/puzzle24.py
index 015873ac..4334be81 100644
--- a/reasoning_gym/games/puzzle24.py
+++ b/reasoning_gym/games/puzzle24.py
@@ -107,7 +107,6 @@ class Puzzle24Dataset(ProceduralDataset):
                 "source_dataset": DATASET_NAME,
                 "source_index": idx,
                 "numbers": numbers,
-                "expression": expr,
                 "difficulty": {"value": (self.config.min_value, self.config.max_value)},
             },
         }
diff --git a/reasoning_gym/geometry/advanced_geometry.py b/reasoning_gym/geometry/advanced_geometry.py
index 9d661a23..d45cfc4a 100644
--- a/reasoning_gym/geometry/advanced_geometry.py
+++ b/reasoning_gym/geometry/advanced_geometry.py
@@ -150,11 +150,10 @@ class AdvancedGeometryDataset(ProceduralDataset):
         question = question_template.format(A=(A.x, A.y), B=(B.x, B.y), C=(C.x, C.y), a="a", b="b")
         answer_str = f"({x_ortho_approx:.3f}, {y_ortho_approx:.3f})"
         metadata = {
-            "A": (A.x, A.y),
-            "B": (B.x, B.y),
-            "C": (C.x, C.y),
-            "ortho": (ortho.x, ortho.y),
-            "orthocenter_exact": (str(ortho.x), str(ortho.y)),
+            "A": (str(A.x), str(A.y)),
+            "B": (str(B.x), str(B.y)),
+            "C": (str(C.x), str(C.y)),
+            "ortho": (str(ortho.x), str(ortho.y)),
             "orthocenter_approx": (x_ortho_approx, y_ortho_approx),
         }
         return question, answer_str, metadata
@@ -185,9 +184,9 @@ class AdvancedGeometryDataset(ProceduralDataset):
         answer_str = f"{radius_approx:.3f}"
 
         metadata = {
-            "A": (A.x, A.y),
-            "B": (B.x, B.y),
-            "C": (C.x, C.y),
+            "A": (str(A.x), str(A.y)),
+            "B": (str(B.x), str(B.y)),
+            "C": (str(C.x), str(C.y)),
             "incircle_radius_exact": str(radius),
             "incircle_radius_approx": radius_approx,
         }
@@ -224,9 +223,9 @@ class AdvancedGeometryDataset(ProceduralDataset):
 
         answer_str = f"{angle_deg:.2f}°"
         metadata = {
-            "A": (A.x, A.y),
-            "B": (B.x, B.y),
-            "C": (C.x, C.y),
+            "A": (str(A.x), str(A.y)),
+            "B": (str(B.x), str(B.y)),
+            "C": (str(C.x), str(C.y)),
             "angle_ABC_degrees": angle_deg,
         }
         return question, answer_str, metadata
@@ -252,8 +251,14 @@ class AdvancedGeometryDataset(ProceduralDataset):
                     x_coord = float(answer.split(",")[0].replace("(", "").strip())
                     y_coord = float(answer.split(",")[1].replace(")", "").strip())
 
-                    expected_x = float(metadata["ortho"][0])
-                    expected_y = float(metadata["ortho"][1])
+                    ortho_0, ortho_1 = metadata["ortho"]
+                    if "/" in ortho_0:
+                        ortho_0 = int(ortho_0.split("/")[0]) / int(ortho_0.split("/")[1])
+                    if "/" in ortho_1:
+                        ortho_1 = int(ortho_1.split("/")[0]) / int(ortho_1.split("/")[1])
+
+                    expected_x = float(ortho_0)
+                    expected_y = float(ortho_1)
                     if x_coord == expected_x and y_coord == expected_y:
                         reward = 1.0
                     elif (np.round(x_coord, 3) == np.round(expected_x, 3)) and (
diff --git a/tests/test_futoshiki.py b/tests/test_futoshiki.py
index dd89a61c..156b2fdd 100644
--- a/tests/test_futoshiki.py
+++ b/tests/test_futoshiki.py
@@ -58,7 +58,7 @@ def test_futoshiki_items():
             assert len(row) <= config.max_board_size
         # Verify constraints format
         constraints = metadata["constraints"]
-        for ((r1, c1), (r2, c2)), rel in constraints.items():
+        for r1, c1, r2, c2, rel in constraints:
             assert 0 <= r1 < config.max_board_size
             assert 0 <= c1 < config.max_board_size
             assert 0 <= r2 < config.max_board_size
@@ -97,7 +97,9 @@ def test_futoshiki_solution_validity():
         item = dataset[i]
         metadata = item["metadata"]
         solution = metadata["solution"]
-        constraints = metadata["constraints"]
+        constraints_meta = metadata["constraints"]
+
+        constraints = {((r1, c1), (r2, c2)): rel for (r1, c1, r2, c2, rel) in constraints_meta}
 
         assert is_valid_solution(solution, config.min_board_size, constraints)
 
@@ -111,7 +113,9 @@ def test_futoshiki_puzzle_solvability():
         item = dataset[i]
         metadata = item["metadata"]
         puzzle = metadata["puzzle"]
-        constraints = metadata["constraints"]
+        constraints_meta = metadata["constraints"]
+
+        constraints = {((r1, c1), (r2, c2)): rel for (r1, c1, r2, c2, rel) in constraints_meta}
 
         # Verify puzzle has exactly one solution
         assert dataset.count_solutions(puzzle, constraints, limit=2) == 1
diff --git a/tests/test_puzzle24.py b/tests/test_puzzle24.py
index 5d8801dc..4b6d3036 100644
--- a/tests/test_puzzle24.py
+++ b/tests/test_puzzle24.py
@@ -47,7 +47,6 @@ def test_puzzle24_basic_properties():
 
         # Check metadata contains required fields
         assert "numbers" in item["metadata"]
-        assert "expression" in item["metadata"]
 
         # Check question format
         assert "Make 24 using" in item["question"]
diff --git a/tests/test_simple_integration.py b/tests/test_simple_integration.py
index 4c82a5a4..641928f4 100644
--- a/tests/test_simple_integration.py
+++ b/tests/test_simple_integration.py
@@ -64,7 +64,6 @@ def test_simple_integration_dataset_items():
 
         assert "integrand" in item["metadata"]
         assert "variable" in item["metadata"]
-        assert "expected_answer_expression" in item["metadata"]
 
         # Verify answer is a mathematical expression
         answer = item["answer"]
diff --git a/tests/test_time_intervals.py b/tests/test_time_intervals.py
index 041651b9..1ca5cfbf 100644
--- a/tests/test_time_intervals.py
+++ b/tests/test_time_intervals.py
@@ -51,8 +51,6 @@ def test_time_intervals_items():
         assert "answer" in item
         assert "metadata" in item
         assert "task_type" in item["metadata"]
-        assert "start_time" in item["metadata"]
-        assert "end_time" in item["metadata"]
 
 
 def test_time_intervals_scoring():
@@ -93,23 +91,18 @@ def test_time_intervals_scoring():
             assert 0 < score < 1
 
 
-def test_time_format_patterns():
-    """Test that generated times match expected formats"""
+def test_oracle_answer():
+    """Test that generated answer is marked correct"""
     config = TimeIntervalsConfig(seed=42, size=500)
     dataset = TimeIntervalsDataset(config)
 
     for i in range(len(dataset)):
         item = dataset[i]
 
-        start_dt = item["metadata"]["start_time"]
-        end_dt = item["metadata"]["end_time"]
+        metadata = item["metadata"]
+        assert "start_time" in metadata
+        assert "end_time" in metadata
 
-        # Verify both are datetime objects
-        assert isinstance(start_dt, datetime)
-        assert isinstance(end_dt, datetime)
-
-        # Verify end is after start
-        assert end_dt >= start_dt, item["question"]
         assert dataset.score_answer(item["answer"], item) == 1.0