reasoning-gym/tests/test_rearc.py
Zafir Stojanovski 290bfc4fdd
(evals): Medium configs (#415)
* updated medium configs

* fix problematic curriculum values / small issues causing exceptions to be raised

* optimus alpha config

* all configs so far

* fix tests
2025-04-14 08:25:31 +02:00

139 lines
5.2 KiB
Python

import pytest
from reasoning_gym.arc.board_format import format_board
from reasoning_gym.arc.rearc import ReArcConfig, ReArcCurriculum, ReArcDataset
def test_rearc_config_validation():
    """Check that ``validate()`` rejects inconsistent ReArc configurations.

    Two argument sets are exercised, each expected to raise ``AssertionError``:
    a lower difficulty bound above the upper bound, and a dataset size of zero.
    """
    rejected_kwargs = (
        {"diff_lb": 0.5, "diff_ub": 0.3},  # lower bound greater than upper bound
        {"size": 0},  # zero-sized dataset
    )
    for kwargs in rejected_kwargs:
        # Construction stays inside the context manager so an error raised
        # by the constructor is treated the same as one from validate().
        with pytest.raises(AssertionError):
            ReArcConfig(**kwargs).validate()
def test_rearc_deterministic():
    """Two datasets built from the same config must yield identical items.

    Items are fetched by index from both datasets and compared for equality
    position by position.
    """
    config = ReArcConfig(seed=42, size=100, diff_lb=0, diff_ub=1)
    first, second = ReArcDataset(config), ReArcDataset(config)

    # Index access is deliberate: this compares what __getitem__ returns
    # for the same position in each dataset.
    for idx in range(len(first)):
        assert first[idx] == second[idx], "ReArc datasets with same seed should match exactly"
def test_rearc_items():
    """Check the shape of every generated item and its metadata.

    Each item must be a dict holding ``question``, ``answer`` and ``metadata``;
    the metadata must carry ``input``, ``output``, ``task_id``, ``rng`` and
    ``pso``, with ``rng`` and ``pso`` inside the configured difficulty bounds.
    """
    config = ReArcConfig(seed=42, size=100, diff_lb=0, diff_ub=1)
    dataset = ReArcDataset(config)

    item_keys = ("question", "answer", "metadata")
    metadata_keys = ("input", "output", "task_id", "rng", "pso")

    for entry in dataset:
        assert isinstance(entry, dict)
        for key in item_keys:
            assert key in entry

        metadata = entry["metadata"]
        for key in metadata_keys:
            assert key in metadata

        # Both difficulty measures must fall within [diff_lb, diff_ub].
        lower, upper = config.diff_lb, config.diff_ub
        assert lower <= metadata["rng"] <= upper
        assert lower <= metadata["pso"] <= upper
def test_rearc_solution_validation():
    """Check ``score_answer`` for a correct answer, a wrong grid and ``None``.

    Expected scores per item: 1.0 for the formatted reference output, 0.05 for
    the fixed non-matching grid below, and 0.0 when no answer is given.
    """
    dataset = ReArcDataset(ReArcConfig(size=100, seed=123))

    # Fixed grid used as the "invalid format" answer; it is the same for every
    # item, so it is built once. The literal must stay at column 0 so that the
    # string content is unchanged.
    invalid_grid = """
9 9 9
1 2 1
7 8 7
0 0 0
"""

    for entry in dataset:
        # The reference output rendered with the dataset's own board options
        # must receive full marks.
        expected = format_board(entry["metadata"]["output"], dataset.board_format_opts)
        assert dataset.score_answer(expected, entry=entry) == 1.0

        # The fixed grid above is expected to score exactly 0.05.
        assert dataset.score_answer(invalid_grid, entry=entry) == 0.05

        # A missing answer scores zero.
        assert dataset.score_answer(None, entry=entry) == 0.0
def test_rearc_scoring_edge_cases():
    """Check ``score_answer`` on partial, malformed and lower-cased answers.

    For every item: a formatted 2x2 all-zero board must score strictly between
    0 and 1, the string ``"[[invalid"`` must score 0.0, and the lower-cased
    reference output must still score 1.0.
    """
    dataset = ReArcDataset(ReArcConfig(size=100, seed=456))

    for entry in dataset:
        # Partial credit: a small all-zero board, formatted like a real answer.
        zero_board = format_board([[0, 0], [0, 0]], dataset.board_format_opts)
        partial_score = dataset.score_answer(zero_board, entry=entry)
        assert 0.0 < partial_score < 1.0

        # Text that is not a board at all earns nothing.
        malformed_score = dataset.score_answer("[[invalid", entry=entry)
        assert malformed_score == 0.0

        # Lower-casing the reference answer must not change its score.
        reference = format_board(entry["metadata"]["output"], dataset.board_format_opts)
        lowered_score = dataset.score_answer(reference.lower(), entry=entry)
        assert lowered_score == 1.0
def test_rearc_curriculum():
    """Walk the ReArc curriculum through per-attribute and global level changes.

    After each level change a configuration is regenerated from the same base
    values, and both difficulty weight lists are compared against the expected
    seven-element list holding a single 1 at the current level.
    """
    num_levels = 7  # length of the weight lists asserted throughout this test

    def one_hot(level):
        """Return ``num_levels`` zeros with a single 1 at index ``level``."""
        return [1 if position == level else 0 for position in range(num_levels)]

    def check_weights(cfg, pso_level, rng_level):
        """Assert that both weight lists of ``cfg`` are one-hot at the given levels."""
        assert cfg.pso_difficulty_weights == one_hot(pso_level)
        assert cfg.rng_difficulty_weights == one_hot(rng_level)

    curriculum = ReArcCurriculum()
    base_value = {"size": 50, "seed": 42}

    # Default configuration: base values pass through and both attributes sit
    # at level 0, i.e. all weight on the first entry.
    initial_cfg: ReArcConfig = curriculum.generate_configuration(base_value)
    assert initial_cfg.seed == 42
    assert initial_cfg.size == 50
    check_weights(initial_cfg, pso_level=0, rng_level=0)

    # Raising only the PSO attribute moves its weight to index 1; RNG stays put.
    curriculum.increment_attr_level("pso_difficulty_weights")
    check_weights(curriculum.generate_configuration(base_value), pso_level=1, rng_level=0)

    # Raising only the RNG attribute moves its weight to index 1; PSO stays put.
    curriculum.increment_attr_level("rng_difficulty_weights")
    check_weights(curriculum.generate_configuration(base_value), pso_level=1, rng_level=1)

    # Lowering the PSO attribute returns it to level 0; RNG remains at level 1.
    curriculum.decrement_attr_level("pso_difficulty_weights")
    check_weights(curriculum.generate_configuration(base_value), pso_level=0, rng_level=1)

    # Setting the global level puts every attribute at level 3.
    curriculum.set_global_level(3)
    check_weights(curriculum.generate_configuration(base_value), pso_level=3, rng_level=3)

    # A global increment should move both attributes to level 4.
    curriculum.increment_global_level()
    check_weights(curriculum.generate_configuration(base_value), pso_level=4, rng_level=4)

    # A global decrement should bring both attributes back to level 3.
    curriculum.decrement_global_level()
    check_weights(curriculum.generate_configuration(base_value), pso_level=3, rng_level=3)