diff --git a/reasoning_gym/algorithmic/game_of_life.py b/reasoning_gym/algorithmic/game_of_life.py index 7a1647e7..d75f57c0 100644 --- a/reasoning_gym/algorithmic/game_of_life.py +++ b/reasoning_gym/algorithmic/game_of_life.py @@ -32,7 +32,7 @@ class GameOfLifeDataset(ProceduralDataset): def __init__(self, config: GameOfLifeConfig): self._prompt_templates = [ - "What will this Game of Life board look like after {simulation_steps} steps of simulation? Reply as array of arrays representing rows in the grid from top to bottom in JSON format. (An empty 3x3 grid would look like this: [[0,0,0],[0,0,0],[0,0,0]])\n\n{board}." + "What will this Game of Life board look like after {simulation_steps} steps of simulation? Assume a Moore neighborhood and wrapping topology. Reply as array of arrays representing rows in the grid from top to bottom in JSON format. (An empty 3x3 grid would look like this: [[0,0,0],[0,0,0],[0,0,0]])\n\n{board}." ] super().__init__(config=config, seed=config.seed, size=config.size) @@ -105,13 +105,42 @@ class GameOfLifeDataset(ProceduralDataset): try: ans_arr = json.loads(answer) correct_arr = json.loads(entry["answer"]) - - if correct_arr != ans_arr: - return 0.01 - else: - return 1.0 # Yay - except Exception as e: + except Exception: return 0.01 + total_cells = 0 + correct_cells = 0 + + # Determine if the array is 2D (i.e. a list of lists) + is_2d = correct_arr and isinstance(correct_arr[0], list) + + if is_2d: + # Iterate over rows and columns of the expected grid. + for i, expected_row in enumerate(correct_arr): + for j, expected_value in enumerate(expected_row): + total_cells += 1 + try: + if ans_arr[i][j] == expected_value: + correct_cells += 1 + except (IndexError, TypeError): + # Either the row or the cell is missing, treat as incorrect. + pass + else: + # 1D array case. 
+ for i, expected_value in enumerate(correct_arr): + total_cells += 1 + try: + if ans_arr[i] == expected_value: + correct_cells += 1 + except IndexError: + pass + + # If for some reason there are no cells, return 0.0. + if total_cells == 0: + return 0.0 + + # Each cell contributes equally. + return correct_cells / total_cells + register_dataset("game_of_life", GameOfLifeDataset, GameOfLifeConfig) diff --git a/reasoning_gym/algorithmic/word_sequence_reversal.py b/reasoning_gym/algorithmic/word_sequence_reversal.py index 67f97ca1..84e297aa 100644 --- a/reasoning_gym/algorithmic/word_sequence_reversal.py +++ b/reasoning_gym/algorithmic/word_sequence_reversal.py @@ -8,7 +8,7 @@ from typing import Optional from ..data import read_data_file from ..factory import ProceduralDataset, register_dataset -QUESTION_TEMPLATE = """Solve the following problem. Provide you answer as a comma-separated list of word with a space the comma. Reverse this list of words: {question}""" +QUESTION_TEMPLATE = """Solve the following problem. Provide you answer as a comma-separated list of words with a space after the comma. 
Reverse this list of words: {question}""" @dataclass diff --git a/reasoning_gym/graphs/quantum_lock.py b/reasoning_gym/graphs/quantum_lock.py index 4d810145..c32085f2 100644 --- a/reasoning_gym/graphs/quantum_lock.py +++ b/reasoning_gym/graphs/quantum_lock.py @@ -192,7 +192,9 @@ Buttons: # Partial credit for reaching target (optional) final_state = self.simulate_sequence(entry["metadata"], user_sequence) if final_state == entry["metadata"]["target_value"]: - return 0.5 # Alternative scoring option + if len(user_sequence) == len(target_sequence): + return 1.0 # Different answer, but equally correct + return 0.5 # Alternative scoring - you're correct, but not optimal return 0.1 diff --git a/tests/test_game_of_life.py b/tests/test_game_of_life.py index abdc17ae..924a12de 100644 --- a/tests/test_game_of_life.py +++ b/tests/test_game_of_life.py @@ -1,3 +1,5 @@ +import json + import pytest from reasoning_gym.algorithmic.game_of_life import GameOfLifeConfig, GameOfLifeDataset @@ -50,6 +52,24 @@ def test_game_of_life_basic_properties(): assert dataset.score_answer(answer=None, entry=item) == 0.0 assert dataset.score_answer(answer="invalid json", entry=item) == 0.01 + config = GameOfLifeConfig(seed=43, size=1, grid_size_x=3, grid_size_y=3, filled_cells=1, simulation_steps=1) + dataset = GameOfLifeDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + ja = json.loads(item["answer"]) + ja[0][0] = 1 + ja[0][1] = 1 + ja[0][2] = 1 + jas = json.dumps(ja) + + # Test the scoring + assert 0.1 < dataset.score_answer(answer=jas, entry=item) < 1.0 + def test_game_of_life_iteration(): """Test that iteration respects dataset size"""