mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
* draft path-star task * typos * fix for paper spec * rm teacherless mode * add imports * fixes * validation tweak * test tweak
195 lines
6.8 KiB
Python
195 lines
6.8 KiB
Python
"""Tests for Path Star graph problem generation"""
|
|
|
|
import pytest
|
|
|
|
from reasoning_gym.graphs.path_star import PathStarConfig, PathStarCurriculum, PathStarDataset
|
|
|
|
|
|
def test_path_star_config_validation():
|
|
"""Test that invalid configs raise appropriate errors"""
|
|
with pytest.raises(AssertionError):
|
|
config = PathStarConfig(degree=1) # Must be >= 2
|
|
config.validate()
|
|
|
|
with pytest.raises(AssertionError):
|
|
config = PathStarConfig(min_path_length=1) # Must be >= 2
|
|
config.validate()
|
|
|
|
with pytest.raises(AssertionError):
|
|
config = PathStarConfig(min_path_length=5, max_path_length=3) # min > max
|
|
config.validate()
|
|
|
|
with pytest.raises(AssertionError):
|
|
config = PathStarConfig(degree=3, max_path_length=5, node_range=16) # node_range too small (need > 3*5+1=16)
|
|
config.validate()
|
|
|
|
|
|
def test_path_star_dataset_deterministic():
|
|
"""Test that dataset generates same items with same seed"""
|
|
config = PathStarConfig(seed=42, size=10)
|
|
dataset1 = PathStarDataset(config)
|
|
dataset2 = PathStarDataset(config)
|
|
|
|
for i in range(len(dataset1)):
|
|
assert dataset1[i] == dataset2[i]
|
|
|
|
|
|
def test_path_star_dataset_items():
|
|
"""Test basic properties of generated items"""
|
|
config = PathStarConfig(min_path_length=3, max_path_length=5, size=10, seed=42)
|
|
dataset = PathStarDataset(config)
|
|
|
|
for i in range(len(dataset)):
|
|
item = dataset[i]
|
|
# Check item structure
|
|
assert isinstance(item, dict)
|
|
assert "question" in item
|
|
assert "answer" in item
|
|
assert "metadata" in item
|
|
|
|
# Check metadata fields
|
|
assert item["metadata"]["source_dataset"] == "path_star"
|
|
assert item["metadata"]["source_index"] == i
|
|
assert "center" in item["metadata"]
|
|
assert "goal" in item["metadata"]
|
|
assert "path_length" in item["metadata"]
|
|
assert "goal_path" in item["metadata"]
|
|
assert "difficulty" in item["metadata"]
|
|
|
|
# Verify answer format: space-separated integers
|
|
answer_parts = item["answer"].split()
|
|
assert all(part.isdigit() for part in answer_parts)
|
|
|
|
# First node should be center, last should be goal
|
|
center = item["metadata"]["center"]
|
|
goal = item["metadata"]["goal"]
|
|
assert int(answer_parts[0]) == center
|
|
assert int(answer_parts[-1]) == goal
|
|
|
|
# Path length should match: center + path_length nodes
|
|
path_length = item["metadata"]["path_length"]
|
|
assert len(answer_parts) == path_length + 1
|
|
|
|
# Path length within configured range
|
|
assert config.min_path_length <= path_length <= config.max_path_length
|
|
|
|
|
|
def test_path_star_dataset_iteration():
|
|
"""Test that iteration respects dataset size"""
|
|
config = PathStarConfig(size=5, seed=42)
|
|
dataset = PathStarDataset(config)
|
|
|
|
items = list(dataset)
|
|
assert len(items) == config.size
|
|
|
|
# Test multiple iterations yield same items
|
|
assert items == list(dataset)
|
|
|
|
|
|
def test_path_star_answer_correctness():
|
|
"""Test that generated paths are valid by checking edge connectivity"""
|
|
config = PathStarConfig(size=20, seed=123)
|
|
dataset = PathStarDataset(config)
|
|
|
|
for i in range(len(dataset)):
|
|
item = dataset[i]
|
|
question = item["question"]
|
|
answer_parts = [int(x) for x in item["answer"].split()]
|
|
|
|
# Parse edges from the question
|
|
# Format: ...edges_str/start goal = ...
|
|
# Extract the task part between "Solve the following task:\n" and the end
|
|
task_line = question.split("Solve the following task:\n")[1].strip()
|
|
edge_part, _ = task_line.split("/")
|
|
edges = set()
|
|
for edge_str in edge_part.split("|"):
|
|
edge_str = edge_str.strip()
|
|
if edge_str:
|
|
u, v = edge_str.split()
|
|
edges.add((int(u), int(v)))
|
|
|
|
# Verify consecutive nodes in the answer are connected by edges
|
|
for j in range(len(answer_parts) - 1):
|
|
u, v = answer_parts[j], answer_parts[j + 1]
|
|
assert (u, v) in edges, f"Edge ({u}, {v}) not found in edges for item {i}"
|
|
|
|
|
|
def test_path_star_score_answer():
|
|
"""Test the score_answer method"""
|
|
config = PathStarConfig(seed=42, size=5)
|
|
dataset = PathStarDataset(config)
|
|
item = dataset[0]
|
|
oracle = item["answer"]
|
|
|
|
# Exact match
|
|
assert dataset.score_answer(oracle, item) == 1.0
|
|
|
|
# Match with extra whitespace
|
|
assert dataset.score_answer(f" {oracle} ", item) == 1.0
|
|
|
|
# Match with extra internal whitespace
|
|
spaced = oracle.replace(" ", " ")
|
|
assert dataset.score_answer(spaced, item) == 1.0
|
|
|
|
# Wrong answer
|
|
assert dataset.score_answer("0 1 2 3", item) == 0.0
|
|
|
|
# None
|
|
assert dataset.score_answer(None, item) == 0.0
|
|
|
|
# Empty string
|
|
assert dataset.score_answer("", item) == 0.0
|
|
|
|
|
|
def test_path_star_reversed():
|
|
"""Test that reversed=True produces correct answer and task format"""
|
|
config_fwd = PathStarConfig(seed=42, size=5, reversed=False)
|
|
config_rev = PathStarConfig(seed=42, size=5, reversed=True)
|
|
dataset_fwd = PathStarDataset(config_fwd)
|
|
dataset_rev = PathStarDataset(config_rev)
|
|
|
|
for i in range(len(dataset_fwd)):
|
|
item_fwd = dataset_fwd[i]
|
|
item_rev = dataset_rev[i]
|
|
|
|
# Reversed answer should be the forward answer reversed
|
|
fwd_parts = item_fwd["answer"].split()
|
|
rev_parts = item_rev["answer"].split()
|
|
assert rev_parts == list(reversed(fwd_parts))
|
|
|
|
# Task format should swap start/goal
|
|
center = item_fwd["metadata"]["center"]
|
|
goal = item_fwd["metadata"]["goal"]
|
|
assert f"/{center} {goal} = " in item_fwd["question"]
|
|
assert f"/{goal} {center} = " in item_rev["question"]
|
|
|
|
|
|
def test_path_star_curriculum():
|
|
"""Test curriculum creates valid configs at various levels"""
|
|
curriculum = PathStarCurriculum()
|
|
|
|
base_value = {"size": 150, "seed": 1}
|
|
|
|
# Level 0 (base)
|
|
base_cfg: PathStarConfig = curriculum.generate_configuration(base_value)
|
|
assert base_cfg.seed == 1
|
|
assert base_cfg.size == 150
|
|
assert base_cfg.degree == 2
|
|
assert base_cfg.node_range == 10_000
|
|
assert base_cfg.min_path_length == 3 and base_cfg.max_path_length == 3
|
|
|
|
# Increment attributes
|
|
curriculum.increment_attr_level("degree")
|
|
curriculum.increment_attr_level("node_range")
|
|
curriculum.increment_attr_level("path_length")
|
|
increased_cfg = curriculum.generate_configuration(base_value)
|
|
assert increased_cfg.degree == 3
|
|
assert increased_cfg.node_range == 50_000
|
|
assert increased_cfg.min_path_length == 3 and increased_cfg.max_path_length == 5
|
|
|
|
# Decrement degree back
|
|
curriculum.decrement_attr_level("degree")
|
|
partial_cfg = curriculum.generate_configuration(base_value)
|
|
assert partial_cfg.degree == 2
|
|
assert partial_cfg.node_range == 50_000
|
|
assert partial_cfg.min_path_length == 3 and partial_cfg.max_path_length == 5
|