diff --git a/README.md b/README.md
index c329d21d..f9fb86a2 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
- `SimpleEquationsDataset`: Generate linear equations with one variable to solve (e.g. "3\*x + 2 = 14")
- `PolynomialEquationsDataset`: Generate polynomial equations with one variable to solve (e.g. "-6*h\*\*4 + 4*h\**2 - 5*h = 0")
-- `PolynomialMultiplicationDataset`: Generate polynomial multiplicatons (e.g. "(8x^3 + x + 2)*(y - 3)")
+- `PolynomialMultiplicationDataset`: Generate polynomial multiplicatons (e.g. "(8x^3 + x + 2)\*(y - 3)")
### Arithmetic Tasks
@@ -118,6 +118,7 @@ See the [Dataset Gallery](GALLERY.md) for a complete list of available datasets
- `SyllogismDataset`: Generates a [syllogism](https://en.wikipedia.org/wiki/Syllogism) reasoning dataset
- `AliceInWonderlandDataset`: Generates [AIW](https://openreview.net/forum?id=Mkl7dzjYiW) (Alice In Wonderland) problems with a few variations
- `ZebraDataset`: Generates [Zebra Puzzles](https://en.wikipedia.org/wiki/Zebra_Puzzle) of varying difficulty.
+- `SelfReferenceDataset`: Generates self-referencing logic puzzles.
### Graph Tasks
diff --git a/reasoning_gym/logic/__init__.py b/reasoning_gym/logic/__init__.py
index dfa1c7ad..c05c4dba 100644
--- a/reasoning_gym/logic/__init__.py
+++ b/reasoning_gym/logic/__init__.py
@@ -4,6 +4,7 @@ Logic tasks for training reasoning capabilities.
from .aiw import AliceInWonderlandConfig, AliceInWonderlandDataset
from .propositional_logic import PropositionalLogicConfig, PropositionalLogicDataset
+from .self_reference import SelfReferenceConfig, SelfReferenceDataset
from .syllogisms import SyllogismConfig, SyllogismDataset, Term
from .zebra_puzzles import ZebraConfig, ZebraDataset
@@ -18,4 +19,6 @@ __all__ = [
"Term",
"ZebraConfig",
"ZebraDataset",
+ "SelfReference",
+ "SelfReferenceDataset",
]
diff --git a/reasoning_gym/logic/self_reference.py b/reasoning_gym/logic/self_reference.py
new file mode 100644
index 00000000..045fc22e
--- /dev/null
+++ b/reasoning_gym/logic/self_reference.py
@@ -0,0 +1,374 @@
+from dataclasses import dataclass
+from random import Random
+from typing import Dict, Optional
+
+from ..factory import ProceduralDataset, register_dataset
+from .contrib.logic_puzzle.generate import generate_puzzle
+
+
+def is_prime(n):
+ """Return True if n is a prime number, False otherwise."""
+ if n < 2:
+ return False
+ for i in range(2, int(n**0.5) + 1):
+ if n % i == 0:
+ return False
+ return True
+
+
+def is_composite(n):
+ """
+ Return True if n is composite.
+ (Composite means an integer greater than 1 that is not prime.)
+ """
+ return n > 1 and not is_prime(n)
+
+
+def generate_dynamic_puzzle(difficulty, rng):
+ """
+ Dynamically generates a 7-statement self-referential puzzle.
+
+ The seven statements (with parameters determined by this function) are:
+
+ 1. "At least a of these 7 statements are true."
+ 2. "At most b of these 7 statements are false."
+ 3. "Exactly c of these 7 statements are true."
+ 4. "Exactly d of these 7 statements are false."
+ 5. "Either Statement 3 or Statement 4 is true, but not both."
+ 6. "The number of true statements is a prime number."
+ 7. "The number of false statements is a composite number."
+
+ The idea is to choose an intended number T (1 ≤ T ≤ 6) of true statements
+ and then “plant” an intended solution. In our construction the truth values
+ for Statements 6 and 7 are forced by T (e.g. Statement 6 should be true exactly
+ when T is prime). For the first four statements the numeric parameters (a, b, c, d)
+ are chosen so that the statement evaluates correctly when compared to T.
+
+ The difficulty parameter (an integer, e.g. 1 for easy up to 10 for hard)
+ influences how “borderline” the numeric choices are. At lower difficulty the numbers
+ are chosen with a clear gap; at higher difficulty they are chosen closer to T.
+
+ Returns:
+ dict: A puzzle dictionary containing:
+ - 'n': number of statements (always 7 here),
+ - 'statements_text': a list of 7 strings (one per statement),
+ - 'parameters': a dict with the numeric parameters (for statements 1-4),
+ - 'intended_assignment': the intended truth values (list of 7 booleans),
+ - 'intended_T': the intended number of true statements.
+ """
+ n = 7
+
+ # Choose an intended number of true statements, T, from 1 to 6 (nontrivial).
+ T = rng.choice(range(1, n))
+
+ # For the global statements (6 and 7), the intended truth is forced:
+ intended6 = is_prime(T) # Statement 6 must be true if T is prime.
+ intended7 = is_composite(n - T) # Statement 7 must be true if (# false) is composite.
+
+ # Among statements 1-5, we need exactly k trues such that overall the total becomes T.
+ # Let k = T - (truth from statements 6 and 7).
+ forced_true_count = (1 if intended6 else 0) + (1 if intended7 else 0)
+ k = T - forced_true_count
+ # k must be between 0 and 5.
+ if not (0 <= k <= 5):
+ # If for some reason it is not in range, fall back to a known configuration (T=4).
+ T = 4
+ intended6 = False
+ intended7 = False
+ k = 4 # so that overall T=4.
+ intended_assignment_15 = [True, True, True, True, False]
+ else:
+ # For statements 1-5, randomly choose which ones are intended true.
+ # We'll index these as 0..4 corresponding to statements 1..5.
+ intended_assignment_15 = [False] * 5
+ if k > 0:
+ true_indices = set(rng.sample(range(5), k))
+ for i in true_indices:
+ intended_assignment_15[i] = True
+
+ # Now, for statements 1-4, choose numeric parameters based on whether the statement is
+ # intended to be true or false. We use the difficulty parameter to control the "margin."
+ #
+ # For statement 1: "At least a of these 7 statements are true."
+ # The condition is: T >= a.
+ def choose_at_least_param(T, intended, diff, rng):
+ # diff will be used as a margin factor: lower diff => wider gap.
+ if intended: # must have a <= T.
+ # At easy difficulty, choose a clearly below T (if possible).
+ low = 1
+ high = T
+ # At lower difficulty, bias toward the lower end.
+ return rng.randint(low, high)
+ else: # must have a > T.
+ low = T + 1
+ high = n # a can be at most n.
+ if low > high:
+ return n
+ return rng.randint(low, high)
+
+ a_param = choose_at_least_param(T, intended_assignment_15[0], difficulty, rng)
+
+ # For statement 2: "At most b of these 7 statements are false."
+ # F = n - T, so condition is: (n - T) <= b <=> T >= n - b.
+ def choose_at_most_param(T, intended, diff, rng):
+ if intended: # b must be >= n - T.
+ low = n - T
+ high = n
+ return rng.randint(low, high)
+ else:
+ # b must be < n - T.
+ low = 0
+ high = max(n - T - 1, 0)
+ return rng.randint(low, high)
+
+ b_param = choose_at_most_param(T, intended_assignment_15[1], difficulty, rng)
+
+ # For statement 3: "Exactly c of these 7 statements are true."
+ def choose_exactly_true_param(T, intended, diff, rng):
+ if intended:
+ return T
+ else:
+ choices = [x for x in range(0, n + 1) if x != T]
+ return rng.choice(choices)
+
+ c_param = choose_exactly_true_param(T, intended_assignment_15[2], difficulty, rng)
+
+ # For statement 4: "Exactly d of these 7 statements are false."
+ # Condition: (n - T) == d.
+ def choose_exactly_false_param(T, intended, diff, rng):
+ false_count = n - T
+ if intended:
+ return false_count
+ else:
+ choices = [x for x in range(0, n + 1) if x != false_count]
+ return rng.choice(choices)
+
+ d_param = choose_exactly_false_param(T, intended_assignment_15[3], difficulty, rng)
+
+ # For statement 5: "Either Statement 3 or Statement 4 is true, but not both."
+ # We do not need a parameter here; the intended condition is that the truth values for
+ # statements 3 and 4 (which are positions 2 and 3 in our 0-indexed list) differ.
+ # The intended truth for statement 5 is taken from our assignment.
+ # (Later the verification function will check: solution[2] != solution[3].)
+
+ # Build the intended assignment for all 7 statements.
+ # For statements 1-5, we use our generated intended_assignment_15.
+ intended_assignment = [
+ intended_assignment_15[0],
+ intended_assignment_15[1],
+ intended_assignment_15[2],
+ intended_assignment_15[3],
+ intended_assignment_15[4],
+ intended6,
+ intended7,
+ ]
+
+ # (If the total intended true count doesn't equal T, adjust statement 5.)
+ current_T = sum(intended_assignment)
+ if current_T != T:
+ # Since only statement 5 is free (its parameter wasn't numeric),
+ # force its intended truth to be what is needed.
+ intended_assignment[4] = T - (current_T - (1 if intended_assignment[4] else 0)) == 1
+
+ # Now build the text for each statement.
+ statements_text = [
+ f"Statement 1: 'At least {a_param} of these 7 statements are true.'",
+ f"Statement 2: 'At most {b_param} of these 7 statements are false.'",
+ f"Statement 3: 'Exactly {c_param} of these 7 statements are true.'",
+ f"Statement 4: 'Exactly {d_param} of these 7 statements are false.'",
+ "Statement 5: 'Either Statement 3 or Statement 4 is true, but not both.'",
+ "Statement 6: 'The number of true statements is a prime number.'",
+ "Statement 7: 'The number of false statements is a composite number.'",
+ ]
+
+ return {
+ "n": n,
+ "statements_text": statements_text,
+ "parameters": {
+ "a": a_param,
+ "b": b_param,
+ "c": c_param,
+ "d": d_param,
+ },
+ "intended_assignment": intended_assignment,
+ "intended_T": T,
+ "difficulty": difficulty,
+ }
+
+
+def verify_solution_dynamic(puzzle, solution):
+ """
+ Verifies a candidate solution for a dynamically generated puzzle.
+
+ The rules are:
+ - If a statement is marked True, then its claim must hold.
+ - If a statement is marked False, then its claim must fail.
+
+ The conditions are as follows:
+ 1. "At least a of these 7 statements are true." => (T >= a)
+ 2. "At most b of these 7 statements are false." => (F <= b)
+ 3. "Exactly c of these 7 statements are true." => (T == c)
+ 4. "Exactly d of these 7 statements are false." => (F == d)
+ 5. "Either Statement 3 or Statement 4 is true, but not both." => (solution[2] != solution[3])
+ 6. "The number of true statements is a prime number." => is_prime(T)
+ 7. "The number of false statements is a composite number." => is_composite(F)
+
+ Parameters:
+ puzzle (dict): The puzzle dictionary returned by generate_dynamic_puzzle.
+ solution (list of bool): A candidate assignment (length 7).
+
+ Returns:
+ bool: True if candidate is self-consistent; False otherwise.
+ """
+ n = puzzle["n"]
+ if len(solution) != n:
+ return False
+ T = sum(solution)
+ F = n - T
+ params = puzzle["parameters"]
+
+ # Statement 1: "At least a of these 7 statements are true."
+ cond1 = T >= params["a"]
+ if solution[0] and not cond1:
+ return False
+ if not solution[0] and cond1:
+ return False
+
+ # Statement 2: "At most b of these 7 statements are false."
+ cond2 = F <= params["b"]
+ if solution[1] and not cond2:
+ return False
+ if not solution[1] and cond2:
+ return False
+
+ # Statement 3: "Exactly c of these 7 statements are true."
+ cond3 = T == params["c"]
+ if solution[2] and not cond3:
+ return False
+ if not solution[2] and cond3:
+ return False
+
+ # Statement 4: "Exactly d of these 7 statements are false."
+ cond4 = F == params["d"]
+ if solution[3] and not cond4:
+ return False
+ if not solution[3] and cond4:
+ return False
+
+ # Statement 5: "Either Statement 3 or Statement 4 is true, but not both."
+ cond5 = solution[2] != solution[3]
+ if solution[4] and not cond5:
+ return False
+ if not solution[4] and cond5:
+ return False
+
+ # Statement 6: "The number of true statements is a prime number."
+ cond6 = is_prime(T)
+ if solution[5] and not cond6:
+ return False
+ if not solution[5] and cond6:
+ return False
+
+ # Statement 7: "The number of false statements is a composite number."
+ cond7 = is_composite(F)
+ if solution[6] and not cond7:
+ return False
+ if not solution[6] and cond7:
+ return False
+
+ return True
+
+
+def print_puzzle_dynamic(puzzle):
+ """Prints the dynamically generated puzzle."""
+ x = ""
+ for stmt in puzzle["statements_text"]:
+ x = x + " - " + stmt + "\n"
+ return x
+
+
+def solve_puzzle_dynamic(puzzle):
+ """
+ Searches all 2^7 possible truth assignments and returns those that
+ are self-consistent with the generated puzzle.
+ """
+ n = puzzle["n"]
+ valid_solutions = []
+ for i in range(2**n):
+ candidate = [(i >> j) & 1 == 1 for j in range(n)]
+ if verify_solution_dynamic(puzzle, candidate):
+ valid_solutions.append(candidate)
+ return valid_solutions
+
+
+@dataclass
+class SelfReferenceConfig:
+ """Configuration for SelfReference puzzle generation"""
+
+ difficulty: int = 5
+ seed: Optional[int] = None
+ size: int = 500
+
+ def validate(self):
+ """Validate configuration parameters"""
+ assert 1 <= self.difficulty <= 10, "difficulty must be between 1 and 10"
+
+
+class SelfReferenceDataset(ProceduralDataset):
+ """Generates self-referential puzzles"""
+
+ def __init__(self, config: SelfReferenceConfig):
+ super().__init__(config=config, seed=config.seed, size=config.size)
+
+ def __getitem__(self, idx: int) -> dict:
+ """Generate a single SelfReference task
+
+ Returns:
+ dict with keys:
+ - question: str, the task description
+ - answer: str, a solution string
+ - metadata: dict with generation parameters
+ """
+ rng = Random(self.seed + idx)
+
+ # Generate puzzle
+ puzzle = generate_dynamic_puzzle(self.config.difficulty, rng)
+ puzz_s = (
+ "Given the truthfulness of these statements, please tell me the number of possible solutions: \n"
+ + print_puzzle_dynamic(puzzle)
+ )
+
+ # Solve puzzle
+ solutions = solve_puzzle_dynamic(puzzle)
+ for idx, sol in enumerate(solutions, start=1):
+ sol_str = ["True" if s else "False" for s in sol]
+ answer = len(solutions)
+
+ return {
+ "question": puzz_s,
+ "answer": answer,
+ "metadata": {},
+ }
+
+ def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
+ """Determine if the solution provided solves the SelfReference task.
+
+ The function awards 1.0 for a correct answer.
+
+ Args:
+ answer (Optional[str]): The user's answer.
+ entry (Dict[str, any]): The original dataset entry containing the correct answer.
+
+ Returns:
+ float: The computed score between 0.0 and 1.0.
+ """
+
+ if answer == None:
+ return 0.0
+ if str(answer) != str(entry["answer"]):
+ return 0.1
+ else:
+ return 1.0 # Yay
+
+
+register_dataset("self_reference", SelfReferenceDataset, SelfReferenceConfig)
diff --git a/tests/test_self_reference.py b/tests/test_self_reference.py
new file mode 100644
index 00000000..66f15081
--- /dev/null
+++ b/tests/test_self_reference.py
@@ -0,0 +1,55 @@
+import pytest
+
+from reasoning_gym.logic.self_reference import SelfReferenceConfig, SelfReferenceDataset
+
+
+def test_self_reference():
+ """Test basic properties and solution of generated items"""
+
+ # Easy
+ config = SelfReferenceConfig(seed=42, size=20, difficulty=1)
+ dataset = SelfReferenceDataset(config)
+
+ for item in dataset:
+ assert isinstance(item, dict)
+ assert "question" in item
+ assert "answer" in item
+ assert "metadata" in item
+
+ # Test the scoring
+ assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+ assert dataset.score_answer(answer=99, entry=item) == 0.1
+ assert dataset.score_answer(answer="99", entry=item) == 0.1
+ assert dataset.score_answer(answer=None, entry=item) == 0.0
+
+ # # Medium
+ config = SelfReferenceConfig(seed=42, size=1, difficulty=5)
+ dataset = SelfReferenceDataset(config)
+
+ for item in dataset:
+ assert isinstance(item, dict)
+ assert "question" in item
+ assert "answer" in item
+ assert "metadata" in item
+
+ # Test the scoring
+ assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+ assert dataset.score_answer(answer=99, entry=item) == 0.1
+ assert dataset.score_answer(answer="99", entry=item) == 0.1
+ assert dataset.score_answer(answer=None, entry=item) == 0.0
+
+ # # Hard
+ config = SelfReferenceConfig(seed=42, size=1, difficulty=10)
+ dataset = SelfReferenceDataset(config)
+
+ for item in dataset:
+ assert isinstance(item, dict)
+ assert "question" in item
+ assert "answer" in item
+ assert "metadata" in item
+
+ # Test the scoring
+ assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
+ assert dataset.score_answer(answer=99, entry=item) == 0.1
+ assert dataset.score_answer(answer="99", entry=item) == 0.1
+ assert dataset.score_answer(answer=None, entry=item) == 0.0
diff --git a/tests/test_tsumego.py b/tests/test_tsumego.py
index 86ac203f..c1d8d7b6 100644
--- a/tests/test_tsumego.py
+++ b/tests/test_tsumego.py
@@ -1,8 +1,9 @@
"""Tests for Ttsumego problem generation"""
-import pytest
from random import Random
+import pytest
+
from reasoning_gym.games.tsumego import TsumegoConfig, TsumegoDataset