From bd8fc9beeb6b9af5f3f60b42e28078c6c2dd3bb2 Mon Sep 17 00:00:00 2001 From: Rich Jones Date: Fri, 7 Feb 2025 15:09:42 +0100 Subject: [PATCH] add self-reference puzzles --- README.md | 3 +- reasoning_gym/logic/__init__.py | 3 + reasoning_gym/logic/self_reference.py | 374 ++++++++++++++++++++++++++ tests/test_self_reference.py | 55 ++++ tests/test_tsumego.py | 3 +- 5 files changed, 436 insertions(+), 2 deletions(-) create mode 100644 reasoning_gym/logic/self_reference.py create mode 100644 tests/test_self_reference.py diff --git a/README.md b/README.md index c329d21d..f9fb86a2 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets - `SimpleEquationsDataset`: Generate linear equations with one variable to solve (e.g. "3\*x + 2 = 14") - `PolynomialEquationsDataset`: Generate polynomial equations with one variable to solve (e.g. "-6*h\*\*4 + 4*h\**2 - 5*h = 0") -- `PolynomialMultiplicationDataset`: Generate polynomial multiplicatons (e.g. "(8x^3 + x + 2)*(y - 3)") +- `PolynomialMultiplicationDataset`: Generate polynomial multiplicatons (e.g. "(8x^3 + x + 2)\*(y - 3)") ### Arithmetic Tasks @@ -118,6 +118,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets - `SyllogismDataset`: Generates a [syllogism](https://en.wikipedia.org/wiki/Syllogism) reasoning dataset - `AliceInWonderlandDataset`: Generates [AIW](https://openreview.net/forum?id=Mkl7dzjYiW) (Alice In Wonderland) problems with a few variations - `ZebraDataset`: Generates [Zebra Puzzles](https://en.wikipedia.org/wiki/Zebra_Puzzle) of varying difficulty. +- `SelfReferenceDataset`: Generates self-referencing logic puzzles. ### Graph Tasks diff --git a/reasoning_gym/logic/__init__.py b/reasoning_gym/logic/__init__.py index dfa1c7ad..c05c4dba 100644 --- a/reasoning_gym/logic/__init__.py +++ b/reasoning_gym/logic/__init__.py @@ -4,6 +4,7 @@ Logic tasks for training reasoning capabilities. 
 from .aiw import AliceInWonderlandConfig, AliceInWonderlandDataset
 from .propositional_logic import PropositionalLogicConfig, PropositionalLogicDataset
+from .self_reference import SelfReferenceConfig, SelfReferenceDataset
 from .syllogisms import SyllogismConfig, SyllogismDataset, Term
 from .zebra_puzzles import ZebraConfig, ZebraDataset
@@ -18,4 +19,6 @@ __all__ = [
     "Term",
     "ZebraConfig",
     "ZebraDataset",
+    "SelfReferenceConfig",
+    "SelfReferenceDataset",
 ]
diff --git a/reasoning_gym/logic/self_reference.py b/reasoning_gym/logic/self_reference.py
new file mode 100644
index 00000000..045fc22e
--- /dev/null
+++ b/reasoning_gym/logic/self_reference.py
@@ -0,0 +1,374 @@
+from dataclasses import dataclass
+from random import Random
+from typing import Dict, Optional
+
+from ..factory import ProceduralDataset, register_dataset
+# NOTE(review): dropped unused import (generate_puzzle from .contrib.logic_puzzle.generate)
+
+
+def is_prime(n):
+    """Return True if n is a prime number, False otherwise."""
+    if n < 2:
+        return False
+    for i in range(2, int(n**0.5) + 1):
+        if n % i == 0:
+            return False
+    return True
+
+
+def is_composite(n):
+    """
+    Return True if n is composite.
+    (Composite means an integer greater than 1 that is not prime.)
+    """
+    return n > 1 and not is_prime(n)
+
+
+def generate_dynamic_puzzle(difficulty, rng):
+    """
+    Dynamically generates a 7-statement self-referential puzzle.
+
+    The seven statements (with parameters determined by this function) are:
+
+      1. "At least a of these 7 statements are true."
+      2. "At most b of these 7 statements are false."
+      3. "Exactly c of these 7 statements are true."
+      4. "Exactly d of these 7 statements are false."
+      5. "Either Statement 3 or Statement 4 is true, but not both."
+      6. "The number of true statements is a prime number."
+      7. "The number of false statements is a composite number."
+
+    The idea is to choose an intended number T (1 ≤ T ≤ 6) of true statements
+    and then “plant” an intended solution.
In our construction the truth values + for Statements 6 and 7 are forced by T (e.g. Statement 6 should be true exactly + when T is prime). For the first four statements the numeric parameters (a, b, c, d) + are chosen so that the statement evaluates correctly when compared to T. + + The difficulty parameter (an integer, e.g. 1 for easy up to 10 for hard) + influences how “borderline” the numeric choices are. At lower difficulty the numbers + are chosen with a clear gap; at higher difficulty they are chosen closer to T. + + Returns: + dict: A puzzle dictionary containing: + - 'n': number of statements (always 7 here), + - 'statements_text': a list of 7 strings (one per statement), + - 'parameters': a dict with the numeric parameters (for statements 1-4), + - 'intended_assignment': the intended truth values (list of 7 booleans), + - 'intended_T': the intended number of true statements. + """ + n = 7 + + # Choose an intended number of true statements, T, from 1 to 6 (nontrivial). + T = rng.choice(range(1, n)) + + # For the global statements (6 and 7), the intended truth is forced: + intended6 = is_prime(T) # Statement 6 must be true if T is prime. + intended7 = is_composite(n - T) # Statement 7 must be true if (# false) is composite. + + # Among statements 1-5, we need exactly k trues such that overall the total becomes T. + # Let k = T - (truth from statements 6 and 7). + forced_true_count = (1 if intended6 else 0) + (1 if intended7 else 0) + k = T - forced_true_count + # k must be between 0 and 5. + if not (0 <= k <= 5): + # If for some reason it is not in range, fall back to a known configuration (T=4). + T = 4 + intended6 = False + intended7 = False + k = 4 # so that overall T=4. + intended_assignment_15 = [True, True, True, True, False] + else: + # For statements 1-5, randomly choose which ones are intended true. + # We'll index these as 0..4 corresponding to statements 1..5. 
+ intended_assignment_15 = [False] * 5 + if k > 0: + true_indices = set(rng.sample(range(5), k)) + for i in true_indices: + intended_assignment_15[i] = True + + # Now, for statements 1-4, choose numeric parameters based on whether the statement is + # intended to be true or false. We use the difficulty parameter to control the "margin." + # + # For statement 1: "At least a of these 7 statements are true." + # The condition is: T >= a. + def choose_at_least_param(T, intended, diff, rng): + # diff will be used as a margin factor: lower diff => wider gap. + if intended: # must have a <= T. + # At easy difficulty, choose a clearly below T (if possible). + low = 1 + high = T + # At lower difficulty, bias toward the lower end. + return rng.randint(low, high) + else: # must have a > T. + low = T + 1 + high = n # a can be at most n. + if low > high: + return n + return rng.randint(low, high) + + a_param = choose_at_least_param(T, intended_assignment_15[0], difficulty, rng) + + # For statement 2: "At most b of these 7 statements are false." + # F = n - T, so condition is: (n - T) <= b <=> T >= n - b. + def choose_at_most_param(T, intended, diff, rng): + if intended: # b must be >= n - T. + low = n - T + high = n + return rng.randint(low, high) + else: + # b must be < n - T. + low = 0 + high = max(n - T - 1, 0) + return rng.randint(low, high) + + b_param = choose_at_most_param(T, intended_assignment_15[1], difficulty, rng) + + # For statement 3: "Exactly c of these 7 statements are true." + def choose_exactly_true_param(T, intended, diff, rng): + if intended: + return T + else: + choices = [x for x in range(0, n + 1) if x != T] + return rng.choice(choices) + + c_param = choose_exactly_true_param(T, intended_assignment_15[2], difficulty, rng) + + # For statement 4: "Exactly d of these 7 statements are false." + # Condition: (n - T) == d. 
+ def choose_exactly_false_param(T, intended, diff, rng): + false_count = n - T + if intended: + return false_count + else: + choices = [x for x in range(0, n + 1) if x != false_count] + return rng.choice(choices) + + d_param = choose_exactly_false_param(T, intended_assignment_15[3], difficulty, rng) + + # For statement 5: "Either Statement 3 or Statement 4 is true, but not both." + # We do not need a parameter here; the intended condition is that the truth values for + # statements 3 and 4 (which are positions 2 and 3 in our 0-indexed list) differ. + # The intended truth for statement 5 is taken from our assignment. + # (Later the verification function will check: solution[2] != solution[3].) + + # Build the intended assignment for all 7 statements. + # For statements 1-5, we use our generated intended_assignment_15. + intended_assignment = [ + intended_assignment_15[0], + intended_assignment_15[1], + intended_assignment_15[2], + intended_assignment_15[3], + intended_assignment_15[4], + intended6, + intended7, + ] + + # (If the total intended true count doesn't equal T, adjust statement 5.) + current_T = sum(intended_assignment) + if current_T != T: + # Since only statement 5 is free (its parameter wasn't numeric), + # force its intended truth to be what is needed. + intended_assignment[4] = T - (current_T - (1 if intended_assignment[4] else 0)) == 1 + + # Now build the text for each statement. 
+ statements_text = [ + f"Statement 1: 'At least {a_param} of these 7 statements are true.'", + f"Statement 2: 'At most {b_param} of these 7 statements are false.'", + f"Statement 3: 'Exactly {c_param} of these 7 statements are true.'", + f"Statement 4: 'Exactly {d_param} of these 7 statements are false.'", + "Statement 5: 'Either Statement 3 or Statement 4 is true, but not both.'", + "Statement 6: 'The number of true statements is a prime number.'", + "Statement 7: 'The number of false statements is a composite number.'", + ] + + return { + "n": n, + "statements_text": statements_text, + "parameters": { + "a": a_param, + "b": b_param, + "c": c_param, + "d": d_param, + }, + "intended_assignment": intended_assignment, + "intended_T": T, + "difficulty": difficulty, + } + + +def verify_solution_dynamic(puzzle, solution): + """ + Verifies a candidate solution for a dynamically generated puzzle. + + The rules are: + - If a statement is marked True, then its claim must hold. + - If a statement is marked False, then its claim must fail. + + The conditions are as follows: + 1. "At least a of these 7 statements are true." => (T >= a) + 2. "At most b of these 7 statements are false." => (F <= b) + 3. "Exactly c of these 7 statements are true." => (T == c) + 4. "Exactly d of these 7 statements are false." => (F == d) + 5. "Either Statement 3 or Statement 4 is true, but not both." => (solution[2] != solution[3]) + 6. "The number of true statements is a prime number." => is_prime(T) + 7. "The number of false statements is a composite number." => is_composite(F) + + Parameters: + puzzle (dict): The puzzle dictionary returned by generate_dynamic_puzzle. + solution (list of bool): A candidate assignment (length 7). + + Returns: + bool: True if candidate is self-consistent; False otherwise. + """ + n = puzzle["n"] + if len(solution) != n: + return False + T = sum(solution) + F = n - T + params = puzzle["parameters"] + + # Statement 1: "At least a of these 7 statements are true." 
+ cond1 = T >= params["a"] + if solution[0] and not cond1: + return False + if not solution[0] and cond1: + return False + + # Statement 2: "At most b of these 7 statements are false." + cond2 = F <= params["b"] + if solution[1] and not cond2: + return False + if not solution[1] and cond2: + return False + + # Statement 3: "Exactly c of these 7 statements are true." + cond3 = T == params["c"] + if solution[2] and not cond3: + return False + if not solution[2] and cond3: + return False + + # Statement 4: "Exactly d of these 7 statements are false." + cond4 = F == params["d"] + if solution[3] and not cond4: + return False + if not solution[3] and cond4: + return False + + # Statement 5: "Either Statement 3 or Statement 4 is true, but not both." + cond5 = solution[2] != solution[3] + if solution[4] and not cond5: + return False + if not solution[4] and cond5: + return False + + # Statement 6: "The number of true statements is a prime number." + cond6 = is_prime(T) + if solution[5] and not cond6: + return False + if not solution[5] and cond6: + return False + + # Statement 7: "The number of false statements is a composite number." + cond7 = is_composite(F) + if solution[6] and not cond7: + return False + if not solution[6] and cond7: + return False + + return True + + +def print_puzzle_dynamic(puzzle): + """Prints the dynamically generated puzzle.""" + x = "" + for stmt in puzzle["statements_text"]: + x = x + " - " + stmt + "\n" + return x + + +def solve_puzzle_dynamic(puzzle): + """ + Searches all 2^7 possible truth assignments and returns those that + are self-consistent with the generated puzzle. 
+ """ + n = puzzle["n"] + valid_solutions = [] + for i in range(2**n): + candidate = [(i >> j) & 1 == 1 for j in range(n)] + if verify_solution_dynamic(puzzle, candidate): + valid_solutions.append(candidate) + return valid_solutions + + +@dataclass +class SelfReferenceConfig: + """Configuration for SelfReference puzzle generation""" + + difficulty: int = 5 + seed: Optional[int] = None + size: int = 500 + + def validate(self): + """Validate configuration parameters""" + assert 1 <= self.difficulty <= 10, "difficulty must be between 1 and 10" + + +class SelfReferenceDataset(ProceduralDataset): + """Generates self-referential puzzles""" + + def __init__(self, config: SelfReferenceConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + + def __getitem__(self, idx: int) -> dict: + """Generate a single SelfReference task + + Returns: + dict with keys: + - question: str, the task description + - answer: str, a solution string + - metadata: dict with generation parameters + """ + rng = Random(self.seed + idx) + + # Generate puzzle + puzzle = generate_dynamic_puzzle(self.config.difficulty, rng) + puzz_s = ( + "Given the truthfulness of these statements, please tell me the number of possible solutions: \n" + + print_puzzle_dynamic(puzzle) + ) + + # Solve puzzle + solutions = solve_puzzle_dynamic(puzzle) + for idx, sol in enumerate(solutions, start=1): + sol_str = ["True" if s else "False" for s in sol] + answer = len(solutions) + + return { + "question": puzz_s, + "answer": answer, + "metadata": {}, + } + + def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float: + """Determine if the solution provided solves the SelfReference task. + + The function awards 1.0 for a correct answer. + + Args: + answer (Optional[str]): The user's answer. + entry (Dict[str, any]): The original dataset entry containing the correct answer. + + Returns: + float: The computed score between 0.0 and 1.0. 
+ """ + + if answer == None: + return 0.0 + if str(answer) != str(entry["answer"]): + return 0.1 + else: + return 1.0 # Yay + + +register_dataset("self_reference", SelfReferenceDataset, SelfReferenceConfig) diff --git a/tests/test_self_reference.py b/tests/test_self_reference.py new file mode 100644 index 00000000..66f15081 --- /dev/null +++ b/tests/test_self_reference.py @@ -0,0 +1,55 @@ +import pytest + +from reasoning_gym.logic.self_reference import SelfReferenceConfig, SelfReferenceDataset + + +def test_self_reference(): + """Test basic properties and solution of generated items""" + + # Easy + config = SelfReferenceConfig(seed=42, size=20, difficulty=1) + dataset = SelfReferenceDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Test the scoring + assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0 + assert dataset.score_answer(answer=99, entry=item) == 0.1 + assert dataset.score_answer(answer="99", entry=item) == 0.1 + assert dataset.score_answer(answer=None, entry=item) == 0.0 + + # # Medium + config = SelfReferenceConfig(seed=42, size=1, difficulty=5) + dataset = SelfReferenceDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Test the scoring + assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0 + assert dataset.score_answer(answer=99, entry=item) == 0.1 + assert dataset.score_answer(answer="99", entry=item) == 0.1 + assert dataset.score_answer(answer=None, entry=item) == 0.0 + + # # Hard + config = SelfReferenceConfig(seed=42, size=1, difficulty=10) + dataset = SelfReferenceDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Test the scoring + assert dataset.score_answer(answer=item["answer"], 
entry=item) == 1.0 + assert dataset.score_answer(answer=99, entry=item) == 0.1 + assert dataset.score_answer(answer="99", entry=item) == 0.1 + assert dataset.score_answer(answer=None, entry=item) == 0.0 diff --git a/tests/test_tsumego.py b/tests/test_tsumego.py index 86ac203f..c1d8d7b6 100644 --- a/tests/test_tsumego.py +++ b/tests/test_tsumego.py @@ -1,8 +1,9 @@ """Tests for Ttsumego problem generation""" -import pytest from random import Random +import pytest + from reasoning_gym.games.tsumego import TsumegoConfig, TsumegoDataset