diff --git a/reasoning_gym/games/n_queens.py b/reasoning_gym/games/n_queens.py index 8fdec48f..7f85c0d7 100644 --- a/reasoning_gym/games/n_queens.py +++ b/reasoning_gym/games/n_queens.py @@ -14,14 +14,29 @@ from ..factory import ProceduralDataset, register_dataset MIN_BOARD_SIZE = 4 MAX_BOARD_SIZE = 12 -QUESTION_TEMPLATE = """Solve this N Queens puzzle: -{puzzle} - -The board size is {n}x{n} and your job is to place {num_removed} queen(s) on the board such that no two queens attack each other. +QUESTION_TEMPLATE = """Your job is to complete an n x n chess board with n Queens in total, such that no two attack each other. No two queens attack each other if they are not in the same row, column, or diagonal. -Place a queen by replacing an underscore (_) with a Q. +You can place a queen by replacing an underscore (_) with a Q. + +Example: +- Input: Given the below board of size 4 x 4 your job is to place 2 queen(s) on the board such that no two queens attack each other. +_ Q _ _ +_ _ _ _ +_ _ _ _ +_ _ Q _ +- Output: +_ Q _ _ +_ _ _ Q +Q _ _ _ +_ _ Q _ +- Explanation + - None of the queens attack each other vertically, horizontally, or diagonally. + - The added queens are marked with Q at the positions (1, 3) and (2, 0). + +Given the below board of size {n} x {n} your job is to place {num_removed} queen(s) on the board such that no two queens attack each other. 
def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
    """Score a candidate answer against the entry's accepted board strings.

    Args:
        answer: Raw answer text, or None when no answer was produced.
        entry: Dataset item; ``entry["metadata"]["valid_answers"]`` is the
            collection of accepted board strings.

    Returns:
        1.0  if ``answer`` is itself one of the accepted board strings.
        0.5  if ``answer`` is a Python literal (for example a list of rows)
             whose ``self._board_to_string`` rendering is an accepted board.
        0.01 if ``answer`` cannot be parsed as a literal or cannot be rendered.
        0.0  if ``answer`` is None, or it parses and renders but the result is
             not an accepted board.
    """
    # Imported locally so this method does not rely on the module's import list.
    import ast

    valid_solutions = entry["metadata"]["valid_answers"]
    if answer is None:
        return 0.0

    if answer in valid_solutions:
        return 1.0

    try:
        # SECURITY: the answer is untrusted text. ast.literal_eval only accepts
        # Python literals (lists, strings, numbers, ...) and never executes
        # code, unlike eval(), which would run arbitrary expressions.
        rendered = self._board_to_string(ast.literal_eval(answer))
        if rendered in valid_solutions:
            return 0.5
    except Exception:
        # Deliberate best-effort boundary: any malformed answer (syntax error,
        # non-literal expression, wrong shape for the renderer) earns the
        # minimal score instead of crashing the scorer.
        return 0.01
    return 0.0
""", } diff --git a/tests/test_n_queens.py b/tests/test_n_queens.py index 16911220..91373592 100644 --- a/tests/test_n_queens.py +++ b/tests/test_n_queens.py @@ -122,6 +122,11 @@ def test_nqueens_score_answer(): # Test None answer gets score 0.0 assert dataset.score_answer(None, item) == 0.0 + # Test python list representation of board (partial solution) + answer = "[['_', 'Q', '_', '_'], ['_', '_', '_', 'Q'], ['Q', '_', '_', '_'], ['_', '_', 'Q', '_']]" + entry = {"metadata": {"valid_answers": {"_ Q _ _\n_ _ _ Q\nQ _ _ _\n_ _ Q _"}}} + assert dataset.score_answer(answer, entry) == 0.5 + def is_valid_solution(board: list[list[str]]) -> bool: """Helper function to verify N Queens solution validity"""