diff --git a/reasoning_gym/games/rush_hour.py b/reasoning_gym/games/rush_hour.py
index 98f0a5ad..9bf9efaa 100644
--- a/reasoning_gym/games/rush_hour.py
+++ b/reasoning_gym/games/rush_hour.py
@@ -156,6 +156,37 @@ class RushHourDataset(ProceduralDataset):
             },
         }
 
+    def score_answer(self, answer: Optional[str], entry: dict) -> float:
+        """Score a Rush Hour solution by simulating the moves.
+
+        Args:
+            answer: String of moves in format "F+1 K+1 M-1 C+3 H+2 ..."
+            entry: The problem entry; the board configuration is read from
+                entry["metadata"]["board_config"]
+
+        Returns:
+            1.0 if solution reaches goal state, 0.0 otherwise (including a
+            missing, empty or non-string answer, and any answer whose
+            simulation raises ValueError, IndexError or AttributeError)
+        """
+        # A non-string answer cannot follow the documented move format, so
+        # reject it up front instead of relying on the simulator to fail.
+        if not isinstance(answer, str) or not answer:
+            return 0.0
+
+        try:
+            # Create board from config
+            board = Board(entry["metadata"]["board_config"])
+
+            # Perform the moves
+            board.perform_moves(answer)
+
+            # Check if solved
+            return 1.0 if board.solved else 0.0
+
+        except (ValueError, IndexError, AttributeError):
+            # Handle malformed input gracefully
+            return 0.0
+
 
 class Board:
     def __init__(self, desc: str):
diff --git a/tests/test_rush_hour.py b/tests/test_rush_hour.py
index 5123ebc5..bd439667 100644
--- a/tests/test_rush_hour.py
+++ b/tests/test_rush_hour.py
@@ -62,6 +62,25 @@ def test_rush_hour_move_filtering():
         assert 5 <= moves <= 10, f"Puzzle with {moves} moves outside configured range 5-10"
 
 
+def test_score_answer():
+    """Test that score_answer correctly validates solutions"""
+    config = RushHourConfig(min_moves=1, max_moves=50, size=10, seed=42)
+    dataset = RushHourDataset(config)
+
+    # Get a puzzle
+    puzzle = dataset[0]
+
+    # Test invalid answers
+    assert dataset.score_answer(None, puzzle) == 0.0
+    assert dataset.score_answer("", puzzle) == 0.0
+    assert dataset.score_answer("invalid", puzzle) == 0.0
+    assert dataset.score_answer("A+1 B-2 INVALID", puzzle) == 0.0
+    assert dataset.score_answer(123, puzzle) == 0.0
+
+    # Test incomplete solution
+    assert dataset.score_answer("A+1 B-2", puzzle) == 0.0
+
+
 def test_perform_moves():
     b = Board("GBBoLoGHIoLMGHIAAMCCCKoMooJKDDEEJFFo")
     assert not b.solved