From bd8fc9beeb6b9af5f3f60b42e28078c6c2dd3bb2 Mon Sep 17 00:00:00 2001 From: Rich Jones Date: Fri, 7 Feb 2025 15:09:42 +0100 Subject: [PATCH] add self-reference puzzles --- README.md | 3 +- reasoning_gym/logic/__init__.py | 3 + reasoning_gym/logic/self_reference.py | 374 ++++++++++++++++++++++++++ tests/test_self_reference.py | 55 ++++ tests/test_tsumego.py | 3 +- 5 files changed, 436 insertions(+), 2 deletions(-) create mode 100644 reasoning_gym/logic/self_reference.py create mode 100644 tests/test_self_reference.py diff --git a/README.md b/README.md index c329d21d..f9fb86a2 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets - `SimpleEquationsDataset`: Generate linear equations with one variable to solve (e.g. "3\*x + 2 = 14") - `PolynomialEquationsDataset`: Generate polynomial equations with one variable to solve (e.g. "-6*h\*\*4 + 4*h\**2 - 5*h = 0") -- `PolynomialMultiplicationDataset`: Generate polynomial multiplicatons (e.g. "(8x^3 + x + 2)*(y - 3)") +- `PolynomialMultiplicationDataset`: Generate polynomial multiplicatons (e.g. "(8x^3 + x + 2)\*(y - 3)") ### Arithmetic Tasks @@ -118,6 +118,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets - `SyllogismDataset`: Generates a [syllogism](https://en.wikipedia.org/wiki/Syllogism) reasoning dataset - `AliceInWonderlandDataset`: Generates [AIW](https://openreview.net/forum?id=Mkl7dzjYiW) (Alice In Wonderland) problems with a few variations - `ZebraDataset`: Generates [Zebra Puzzles](https://en.wikipedia.org/wiki/Zebra_Puzzle) of varying difficulty. +- `SelfReferenceDataset`: Generates self-referencing logic puzzles. ### Graph Tasks diff --git a/reasoning_gym/logic/__init__.py b/reasoning_gym/logic/__init__.py index dfa1c7ad..c05c4dba 100644 --- a/reasoning_gym/logic/__init__.py +++ b/reasoning_gym/logic/__init__.py @@ -4,6 +4,7 @@ Logic tasks for training reasoning capabilities. 
 from .aiw import AliceInWonderlandConfig, AliceInWonderlandDataset
 from .propositional_logic import PropositionalLogicConfig, PropositionalLogicDataset
+from .self_reference import SelfReferenceConfig, SelfReferenceDataset
 from .syllogisms import SyllogismConfig, SyllogismDataset, Term
 from .zebra_puzzles import ZebraConfig, ZebraDataset
@@ -18,4 +19,6 @@ __all__ = [
     "Term",
     "ZebraConfig",
     "ZebraDataset",
+    "SelfReferenceConfig",
+    "SelfReferenceDataset",
 ]
diff --git a/reasoning_gym/logic/self_reference.py b/reasoning_gym/logic/self_reference.py
new file mode 100644
index 00000000..045fc22e
--- /dev/null
+++ b/reasoning_gym/logic/self_reference.py
@@ -0,0 +1,374 @@
+from dataclasses import dataclass
+from random import Random
+from typing import Dict, Optional
+
+from ..factory import ProceduralDataset, register_dataset
+# NOTE(review): dropped unused import (generate_puzzle from .contrib.logic_puzzle.generate)
+
+
+def is_prime(n):
+    """Return True if n is a prime number, False otherwise."""
+    if n < 2:
+        return False
+    for i in range(2, int(n**0.5) + 1):
+        if n % i == 0:
+            return False
+    return True
+
+
+def is_composite(n):
+    """
+    Return True if n is composite.
+    (Composite means an integer greater than 1 that is not prime.)
+    """
+    return n > 1 and not is_prime(n)
+
+
+def generate_dynamic_puzzle(difficulty, rng):
+    """
+    Dynamically generates a 7-statement self-referential puzzle.
+
+    The seven statements (with parameters determined by this function) are:
+
+      1. "At least a of these 7 statements are true."
+      2. "At most b of these 7 statements are false."
+      3. "Exactly c of these 7 statements are true."
+      4. "Exactly d of these 7 statements are false."
+      5. "Either Statement 3 or Statement 4 is true, but not both."
+      6. "The number of true statements is a prime number."
+      7. "The number of false statements is a composite number."
+
+    The idea is to choose an intended number T (1 ≤ T ≤ 6) of true statements
+    and then “plant” an intended solution.
In our construction the truth values + for Statements 6 and 7 are forced by T (e.g. Statement 6 should be true exactly + when T is prime). For the first four statements the numeric parameters (a, b, c, d) + are chosen so that the statement evaluates correctly when compared to T. + + The difficulty parameter (an integer, e.g. 1 for easy up to 10 for hard) + influences how “borderline” the numeric choices are. At lower difficulty the numbers + are chosen with a clear gap; at higher difficulty they are chosen closer to T. + + Returns: + dict: A puzzle dictionary containing: + - 'n': number of statements (always 7 here), + - 'statements_text': a list of 7 strings (one per statement), + - 'parameters': a dict with the numeric parameters (for statements 1-4), + - 'intended_assignment': the intended truth values (list of 7 booleans), + - 'intended_T': the intended number of true statements. + """ + n = 7 + + # Choose an intended number of true statements, T, from 1 to 6 (nontrivial). + T = rng.choice(range(1, n)) + + # For the global statements (6 and 7), the intended truth is forced: + intended6 = is_prime(T) # Statement 6 must be true if T is prime. + intended7 = is_composite(n - T) # Statement 7 must be true if (# false) is composite. + + # Among statements 1-5, we need exactly k trues such that overall the total becomes T. + # Let k = T - (truth from statements 6 and 7). + forced_true_count = (1 if intended6 else 0) + (1 if intended7 else 0) + k = T - forced_true_count + # k must be between 0 and 5. + if not (0 <= k <= 5): + # If for some reason it is not in range, fall back to a known configuration (T=4). + T = 4 + intended6 = False + intended7 = False + k = 4 # so that overall T=4. + intended_assignment_15 = [True, True, True, True, False] + else: + # For statements 1-5, randomly choose which ones are intended true. + # We'll index these as 0..4 corresponding to statements 1..5. 
+ intended_assignment_15 = [False] * 5 + if k > 0: + true_indices = set(rng.sample(range(5), k)) + for i in true_indices: + intended_assignment_15[i] = True + + # Now, for statements 1-4, choose numeric parameters based on whether the statement is + # intended to be true or false. We use the difficulty parameter to control the "margin." + # + # For statement 1: "At least a of these 7 statements are true." + # The condition is: T >= a. + def choose_at_least_param(T, intended, diff, rng): + # diff will be used as a margin factor: lower diff => wider gap. + if intended: # must have a <= T. + # At easy difficulty, choose a clearly below T (if possible). + low = 1 + high = T + # At lower difficulty, bias toward the lower end. + return rng.randint(low, high) + else: # must have a > T. + low = T + 1 + high = n # a can be at most n. + if low > high: + return n + return rng.randint(low, high) + + a_param = choose_at_least_param(T, intended_assignment_15[0], difficulty, rng) + + # For statement 2: "At most b of these 7 statements are false." + # F = n - T, so condition is: (n - T) <= b <=> T >= n - b. + def choose_at_most_param(T, intended, diff, rng): + if intended: # b must be >= n - T. + low = n - T + high = n + return rng.randint(low, high) + else: + # b must be < n - T. + low = 0 + high = max(n - T - 1, 0) + return rng.randint(low, high) + + b_param = choose_at_most_param(T, intended_assignment_15[1], difficulty, rng) + + # For statement 3: "Exactly c of these 7 statements are true." + def choose_exactly_true_param(T, intended, diff, rng): + if intended: + return T + else: + choices = [x for x in range(0, n + 1) if x != T] + return rng.choice(choices) + + c_param = choose_exactly_true_param(T, intended_assignment_15[2], difficulty, rng) + + # For statement 4: "Exactly d of these 7 statements are false." + # Condition: (n - T) == d. 
+ def choose_exactly_false_param(T, intended, diff, rng): + false_count = n - T + if intended: + return false_count + else: + choices = [x for x in range(0, n + 1) if x != false_count] + return rng.choice(choices) + + d_param = choose_exactly_false_param(T, intended_assignment_15[3], difficulty, rng) + + # For statement 5: "Either Statement 3 or Statement 4 is true, but not both." + # We do not need a parameter here; the intended condition is that the truth values for + # statements 3 and 4 (which are positions 2 and 3 in our 0-indexed list) differ. + # The intended truth for statement 5 is taken from our assignment. + # (Later the verification function will check: solution[2] != solution[3].) + + # Build the intended assignment for all 7 statements. + # For statements 1-5, we use our generated intended_assignment_15. + intended_assignment = [ + intended_assignment_15[0], + intended_assignment_15[1], + intended_assignment_15[2], + intended_assignment_15[3], + intended_assignment_15[4], + intended6, + intended7, + ] + + # (If the total intended true count doesn't equal T, adjust statement 5.) + current_T = sum(intended_assignment) + if current_T != T: + # Since only statement 5 is free (its parameter wasn't numeric), + # force its intended truth to be what is needed. + intended_assignment[4] = T - (current_T - (1 if intended_assignment[4] else 0)) == 1 + + # Now build the text for each statement. 
+ statements_text = [ + f"Statement 1: 'At least {a_param} of these 7 statements are true.'", + f"Statement 2: 'At most {b_param} of these 7 statements are false.'", + f"Statement 3: 'Exactly {c_param} of these 7 statements are true.'", + f"Statement 4: 'Exactly {d_param} of these 7 statements are false.'", + "Statement 5: 'Either Statement 3 or Statement 4 is true, but not both.'", + "Statement 6: 'The number of true statements is a prime number.'", + "Statement 7: 'The number of false statements is a composite number.'", + ] + + return { + "n": n, + "statements_text": statements_text, + "parameters": { + "a": a_param, + "b": b_param, + "c": c_param, + "d": d_param, + }, + "intended_assignment": intended_assignment, + "intended_T": T, + "difficulty": difficulty, + } + + +def verify_solution_dynamic(puzzle, solution): + """ + Verifies a candidate solution for a dynamically generated puzzle. + + The rules are: + - If a statement is marked True, then its claim must hold. + - If a statement is marked False, then its claim must fail. + + The conditions are as follows: + 1. "At least a of these 7 statements are true." => (T >= a) + 2. "At most b of these 7 statements are false." => (F <= b) + 3. "Exactly c of these 7 statements are true." => (T == c) + 4. "Exactly d of these 7 statements are false." => (F == d) + 5. "Either Statement 3 or Statement 4 is true, but not both." => (solution[2] != solution[3]) + 6. "The number of true statements is a prime number." => is_prime(T) + 7. "The number of false statements is a composite number." => is_composite(F) + + Parameters: + puzzle (dict): The puzzle dictionary returned by generate_dynamic_puzzle. + solution (list of bool): A candidate assignment (length 7). + + Returns: + bool: True if candidate is self-consistent; False otherwise. + """ + n = puzzle["n"] + if len(solution) != n: + return False + T = sum(solution) + F = n - T + params = puzzle["parameters"] + + # Statement 1: "At least a of these 7 statements are true." 
+ cond1 = T >= params["a"] + if solution[0] and not cond1: + return False + if not solution[0] and cond1: + return False + + # Statement 2: "At most b of these 7 statements are false." + cond2 = F <= params["b"] + if solution[1] and not cond2: + return False + if not solution[1] and cond2: + return False + + # Statement 3: "Exactly c of these 7 statements are true." + cond3 = T == params["c"] + if solution[2] and not cond3: + return False + if not solution[2] and cond3: + return False + + # Statement 4: "Exactly d of these 7 statements are false." + cond4 = F == params["d"] + if solution[3] and not cond4: + return False + if not solution[3] and cond4: + return False + + # Statement 5: "Either Statement 3 or Statement 4 is true, but not both." + cond5 = solution[2] != solution[3] + if solution[4] and not cond5: + return False + if not solution[4] and cond5: + return False + + # Statement 6: "The number of true statements is a prime number." + cond6 = is_prime(T) + if solution[5] and not cond6: + return False + if not solution[5] and cond6: + return False + + # Statement 7: "The number of false statements is a composite number." + cond7 = is_composite(F) + if solution[6] and not cond7: + return False + if not solution[6] and cond7: + return False + + return True + + +def print_puzzle_dynamic(puzzle): + """Prints the dynamically generated puzzle.""" + x = "" + for stmt in puzzle["statements_text"]: + x = x + " - " + stmt + "\n" + return x + + +def solve_puzzle_dynamic(puzzle): + """ + Searches all 2^7 possible truth assignments and returns those that + are self-consistent with the generated puzzle. 
+ """ + n = puzzle["n"] + valid_solutions = [] + for i in range(2**n): + candidate = [(i >> j) & 1 == 1 for j in range(n)] + if verify_solution_dynamic(puzzle, candidate): + valid_solutions.append(candidate) + return valid_solutions + + +@dataclass +class SelfReferenceConfig: + """Configuration for SelfReference puzzle generation""" + + difficulty: int = 5 + seed: Optional[int] = None + size: int = 500 + + def validate(self): + """Validate configuration parameters""" + assert 1 <= self.difficulty <= 10, "difficulty must be between 1 and 10" + + +class SelfReferenceDataset(ProceduralDataset): + """Generates self-referential puzzles""" + + def __init__(self, config: SelfReferenceConfig): + super().__init__(config=config, seed=config.seed, size=config.size) + + def __getitem__(self, idx: int) -> dict: + """Generate a single SelfReference task + + Returns: + dict with keys: + - question: str, the task description + - answer: str, a solution string + - metadata: dict with generation parameters + """ + rng = Random(self.seed + idx) + + # Generate puzzle + puzzle = generate_dynamic_puzzle(self.config.difficulty, rng) + puzz_s = ( + "Given the truthfulness of these statements, please tell me the number of possible solutions: \n" + + print_puzzle_dynamic(puzzle) + ) + + # Solve puzzle + solutions = solve_puzzle_dynamic(puzzle) + for idx, sol in enumerate(solutions, start=1): + sol_str = ["True" if s else "False" for s in sol] + answer = len(solutions) + + return { + "question": puzz_s, + "answer": answer, + "metadata": {}, + } + + def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float: + """Determine if the solution provided solves the SelfReference task. + + The function awards 1.0 for a correct answer. + + Args: + answer (Optional[str]): The user's answer. + entry (Dict[str, any]): The original dataset entry containing the correct answer. + + Returns: + float: The computed score between 0.0 and 1.0. 
+ """ + + if answer == None: + return 0.0 + if str(answer) != str(entry["answer"]): + return 0.1 + else: + return 1.0 # Yay + + +register_dataset("self_reference", SelfReferenceDataset, SelfReferenceConfig) diff --git a/tests/test_self_reference.py b/tests/test_self_reference.py new file mode 100644 index 00000000..66f15081 --- /dev/null +++ b/tests/test_self_reference.py @@ -0,0 +1,55 @@ +import pytest + +from reasoning_gym.logic.self_reference import SelfReferenceConfig, SelfReferenceDataset + + +def test_self_reference(): + """Test basic properties and solution of generated items""" + + # Easy + config = SelfReferenceConfig(seed=42, size=20, difficulty=1) + dataset = SelfReferenceDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Test the scoring + assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0 + assert dataset.score_answer(answer=99, entry=item) == 0.1 + assert dataset.score_answer(answer="99", entry=item) == 0.1 + assert dataset.score_answer(answer=None, entry=item) == 0.0 + + # # Medium + config = SelfReferenceConfig(seed=42, size=1, difficulty=5) + dataset = SelfReferenceDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Test the scoring + assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0 + assert dataset.score_answer(answer=99, entry=item) == 0.1 + assert dataset.score_answer(answer="99", entry=item) == 0.1 + assert dataset.score_answer(answer=None, entry=item) == 0.0 + + # # Hard + config = SelfReferenceConfig(seed=42, size=1, difficulty=10) + dataset = SelfReferenceDataset(config) + + for item in dataset: + assert isinstance(item, dict) + assert "question" in item + assert "answer" in item + assert "metadata" in item + + # Test the scoring + assert dataset.score_answer(answer=item["answer"], 
entry=item) == 1.0 + assert dataset.score_answer(answer=99, entry=item) == 0.1 + assert dataset.score_answer(answer="99", entry=item) == 0.1 + assert dataset.score_answer(answer=None, entry=item) == 0.0 diff --git a/tests/test_tsumego.py b/tests/test_tsumego.py index 86ac203f..c1d8d7b6 100644 --- a/tests/test_tsumego.py +++ b/tests/test_tsumego.py @@ -1,8 +1,9 @@ """Tests for Ttsumego problem generation""" -import pytest from random import Random +import pytest + from reasoning_gym.games.tsumego import TsumegoConfig, TsumegoDataset