Merge branch 'main' into rich/ab

This commit is contained in:
Andreas Köpf 2025-02-11 23:34:48 +01:00 committed by GitHub
commit 27938ce13a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 759 additions and 12 deletions

View file

@ -10,6 +10,7 @@ from .ab import ABConfig, ABDataset
from .base_conversion import BaseConversionConfig, BaseConversionDataset from .base_conversion import BaseConversionConfig, BaseConversionDataset
from .binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset from .binary_matrix import BinaryMatrixConfig, BinaryMatrixDataset
from .caesar_cipher import CaesarCipherConfig, CaesarCipherDataset from .caesar_cipher import CaesarCipherConfig, CaesarCipherDataset
from .count_primes import CountPrimesConfig, CountPrimesDataset
from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset from .group_anagrams import GroupAnagramsConfig, GroupAnagramsDataset
from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset from .isomorphic_strings import IsomorphicStringsConfig, IsomorphicStringsDataset
from .letter_counting import LetterCountingConfig, LetterCountingDataset from .letter_counting import LetterCountingConfig, LetterCountingDataset
@ -69,4 +70,6 @@ __all__ = [
"BinaryMatrixDataset", "BinaryMatrixDataset",
"ABConfig", "ABConfig",
"ABDataset", "ABDataset",
"CountPrimesConfig",
"CountPrimesDataset",
] ]

View file

@ -0,0 +1,63 @@
"""Count prime numbers in a given interval.
Solution obtained with Sieve of Eratosthenes:
https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes
"""
import math
from dataclasses import dataclass
from random import Random
from typing import Optional
from ..factory import ProceduralDataset, register_dataset
QUESTION_TEMPLATE = """Count how many prime numbers there are between {start} and {end} (inclusive) ?"""
@dataclass
class CountPrimesConfig:
    """Configuration for Count Primes dataset generation"""

    max_n: int = 10_000  # inclusive upper bound of the sampled interval
    size: int = 500  # virtual dataset size
    seed: Optional[int] = None  # RNG seed; None lets the dataset choose

    def validate(self):
        """Validate configuration parameters"""
        assert self.max_n >= 1, "max_n must be at least 1"
class CountPrimesDataset(ProceduralDataset):
    """Generates Count Primes exercises with configurable difficulty"""

    def __init__(self, config: CountPrimesConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)
        # Sieve once up front; each __getitem__ then only slices the flag list.
        self.primes = self._get_primes(config.max_n + 1)

    def _get_primes(self, n: int) -> list[bool]:
        """Sieve of Eratosthenes: index i is True iff i is prime, for 0 <= i < n."""
        if n <= 1:
            return []
        primes = [True] * n
        primes[0] = primes[1] = False
        for i in range(2, int(math.sqrt(n)) + 1):
            if primes[i]:
                # Start at i*i: smaller multiples of i were already struck
                # out by smaller prime factors.
                for j in range(i * i, n, i):
                    primes[j] = False
        return primes

    def __getitem__(self, idx: int) -> dict:
        """Generate a single Count Primes question

        Returns:
            dict with question/answer strings and metadata holding the
            interval bounds, the primality flags of the interval, and the count.
        """
        # Per-index RNG keeps items deterministic regardless of access order.
        rng = Random(self.seed + idx)
        start = rng.randint(1, self.config.max_n)
        end = rng.randint(start, self.config.max_n)
        primes = self.primes[start : end + 1]
        answer = sum(primes)  # each True flag counts as 1
        return {
            "question": QUESTION_TEMPLATE.format(start=start, end=end),
            "answer": str(answer),
            "metadata": {"start": start, "end": end, "primes": primes, "solution": answer},
        }


register_dataset("count_primes", CountPrimesDataset, CountPrimesConfig)

View file

@ -60,22 +60,16 @@ class RotateMatrixDataset(ProceduralDataset):
matrix = [numbers[i * n : (i + 1) * n] for i in range(n)] matrix = [numbers[i * n : (i + 1) * n] for i in range(n)]
return matrix return matrix
def _rot90(self, matrix: list[list[int]]) -> list[list[int]]:
"""quarter clockwise rotation"""
return [list(row) for row in zip(*matrix[::-1])]
def _get_rotated(self, matrix: list[list[int]], num_rotations: int) -> list[list[int]]: def _get_rotated(self, matrix: list[list[int]], num_rotations: int) -> list[list[int]]:
"""Rotate the matrix K times by 90 degrees clockwise""" """Rotate the matrix K times by 90 degrees clockwise"""
num_rotations %= 4 num_rotations %= 4
n = len(matrix)
output = deepcopy(matrix) output = deepcopy(matrix)
for _ in range(num_rotations): for _ in range(num_rotations):
for l in range(n // 2): output = self._rot90(output)
for i in range(l, n - 1 - l):
(output[l][i], output[i][n - 1 - l], output[n - 1 - l][n - 1 - i], output[n - 1 - i][l]) = (
output[n - 1 - i][l],
output[l][i],
output[i][n - 1 - l],
output[n - 1 - l][n - 1 - i],
)
return output return output
def _matrix_to_str(self, matrix: list[list[int]]) -> str: def _matrix_to_str(self, matrix: list[list[int]]) -> str:

View file

@ -6,6 +6,7 @@ from .basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConf
from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset
from .chain_sum import ChainSum, ChainSumConfig from .chain_sum import ChainSum, ChainSumConfig
from .count_bits import CountBitsConfig, CountBitsDataset from .count_bits import CountBitsConfig, CountBitsDataset
from .dice import DiceConfig, DiceDataset
from .fraction_simplification import FractionSimplificationConfig, FractionSimplificationDataset from .fraction_simplification import FractionSimplificationConfig, FractionSimplificationDataset
from .gcd import GCDConfig, GCDDataset from .gcd import GCDConfig, GCDDataset
from .gsm_symbolic.gsm_symbolic import GSMSymbolicDataset, GSMSymbolicDatasetConfig from .gsm_symbolic.gsm_symbolic import GSMSymbolicDataset, GSMSymbolicDatasetConfig
@ -38,4 +39,6 @@ __all__ = [
"TimeIntervalsDataset", "TimeIntervalsDataset",
"CountBitsConfig", "CountBitsConfig",
"CountBitsDataset", "CountBitsDataset",
"DiceConfig",
"DiceDataset",
] ]

View file

@ -2,6 +2,7 @@ import random
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional from typing import Optional
from ..coaching import AttributeType, BaseCurriculum, RangeAttributeDefinition
from ..factory import ProceduralDataset, register_dataset from ..factory import ProceduralDataset, register_dataset
@ -112,5 +113,36 @@ class ChainSum(ProceduralDataset):
return expression, result return expression, result
class ChainSumCurriculum(BaseCurriculum):
    """Curriculum scaling ChainSum difficulty via term count and digit width."""

    def __init__(self):
        super().__init__(ChainSumCurriculum.__name__, ChainSumConfig)

        # num_terms drives the min_terms/max_terms config range.
        terms_attr = RangeAttributeDefinition(
            name="num_terms",
            levels=[2, 3, 4, 5],
            default_level=0,  # start with 2 terms
            description="Maximum number of terms in the expression",
            attr_type=AttributeType.APPEND,
            min_value=2,  # ensure at least 2 terms
            lower_field_name="min_terms",
            upper_field_name="max_terms",
        )
        # num_digits drives the min_digits/max_digits config range.
        digits_attr = RangeAttributeDefinition(
            name="num_digits",
            levels=[1, 2, 4, 10],
            default_level=0,  # start with 1-digit numbers
            description="Number of digits in each operand",
            attr_type=AttributeType.APPEND,
            min_value=1,  # operands are at least 1 digit wide
            lower_field_name="min_digits",
            upper_field_name="max_digits",
        )
        self._define_attributes((terms_attr, digits_attr))
# Register the dataset # Register the dataset
register_dataset("chain_sum", ChainSum, ChainSumConfig) register_dataset("chain_sum", ChainSum, ChainSumConfig)

View file

@ -0,0 +1,149 @@
from dataclasses import dataclass
from functools import reduce
from math import gcd
from random import Random
from typing import Dict, Optional
from ..factory import ProceduralDataset, register_dataset
def compute_probability(dice, target):
"""
Computes the probability of rolling a total of at least `target`
when rolling dice specified in the list `dice`. Each element in dice
is the number of sides on that die. The computation is done via dynamic programming.
Returns the probability as a fraction (numerator, denominator) and as a float.
"""
# dp[i][s] = number of ways to get sum s using the first i dice.
# We use only one dictionary for the current dp state.
dp = {0: 1}
for sides in dice:
new_dp = {}
for current_sum, count in dp.items():
# Each die gives a number from 1 to sides.
for face in range(1, sides + 1):
new_sum = current_sum + face
new_dp[new_sum] = new_dp.get(new_sum, 0) + count
dp = new_dp
total_outcomes = reduce(lambda a, b: a * b, dice, 1)
ways = sum(count for s, count in dp.items() if s >= target)
# Simplify the fraction (ways / total_outcomes)
def simplify(n, d):
common = gcd(n, d)
return n // common, d // common
frac = simplify(ways, total_outcomes)
return frac, ways / total_outcomes
def generate_puzzle(num_dice, max_dice_size, rng):
"""
Generates a puzzle:
- It forces one die to have max_dice_size.
- The other (num_dice-1) dice are chosen randomly between 2 and max_dice_size-1.
- The dice are then shuffled.
- The target total is chosen roughly in the middle (but you can adjust the method).
It then computes the probability of rolling a total at least the target.
Finally, it prints out the puzzle statement and the answer.
"""
# Guarantee one die is the maximum.
dice = [max_dice_size]
for _ in range(num_dice - 1):
# Choose a die size randomly from 2 up to max_dice_size-1.
# (If max_dice_size == 2 then all dice are 2-sided.)
if max_dice_size > 2:
die = rng.randint(2, max_dice_size - 1)
else:
die = 2
dice.append(die)
# Optionally, sort dice in descending order (as is common in puzzles)
dice.sort(reverse=True)
# Compute minimum and maximum possible totals.
min_total = num_dice # each die gives at least 1
max_total = sum(dice)
# Choose a target total. For an interesting puzzle,
# we choose a target somewhere in the middle third of the range.
low_target = min_total + (max_total - min_total) // 3
high_target = min_total + 2 * (max_total - min_total) // 3
target = rng.randint(low_target, high_target)
# Compute probability.
(num, den), prob = compute_probability(dice, target)
# Create a string representing the dice, e.g., "1d20, 1d17, 1d6" etc.
dice_str = ", ".join(f"1d{s}" for s in dice)
# Return the puzzle.
return {"dice_str": dice_str, "target": target, "num": num, "den": den}
@dataclass
class DiceConfig:
    """Configuration for dice puzzle generation"""

    num_dice: int = 4  # how many dice are rolled per puzzle
    max_dice_size: int = 20  # largest die; one die always has this many sides
    seed: Optional[int] = None
    size: int = 500  # virtual dataset size

    def validate(self):
        """Validate configuration parameters"""
        checks = (
            (self.num_dice >= 1, "num_dice must be gte 1"),
            (self.max_dice_size >= 2, "max_dice_size must be gte 2"),
        )
        for ok, message in checks:
            assert ok, message
class DiceDataset(ProceduralDataset):
    """Generates Dice-based puzzles with configurable parameters"""

    def __init__(self, config: DiceConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

    def __getitem__(self, idx: int) -> dict:
        """Generate a single Dice task

        Returns:
            dict with keys:
                - question: str, the task description
                - answer: str, the reduced fraction, e.g. "3/8"
                - metadata: dict with generation parameters
        """
        # Per-index RNG keeps items deterministic regardless of access order.
        rng = Random(self.seed + idx)
        puzzle = generate_puzzle(self.config.num_dice, self.config.max_dice_size, rng)
        puzzle_str = f"I have these dice: {puzzle['dice_str']}. What are the odds of rolling {puzzle['target']} or higher? (Assume that all dice are rolled at once, and that '1d6' represents one roll of a 6-sided dice.) Please respond with a reduced fraction representing the probability [ex., 1/60]."
        answer_str = f"{puzzle['num']}/{puzzle['den']}"
        # NOTE(review): metadata is intentionally empty; consider recording the
        # dice and target so consumers need not reparse the question text.
        return {
            "question": puzzle_str,
            "answer": answer_str,
            "metadata": {},
        }

    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
        """Determine if the solution provided solves the Dice task.

        Awards 1.0 for an exact (case- and newline-insensitive) match with the
        reference answer, 0.01 for any other non-None answer, and 0.0 for None.

        Args:
            answer (Optional[str]): The user's answer.
            entry (Dict[str, any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
        """
        if answer is None:  # was `answer == None`; identity check is the idiom
            return 0.0

        def normalize(text: str) -> str:
            return text.lower().replace("\n", "")

        # Partial credit (0.01) signals a well-formed but wrong attempt.
        return 1.0 if normalize(answer) == normalize(entry["answer"]) else 0.01


register_dataset("dice", DiceDataset, DiceConfig)

View file

@ -0,0 +1,14 @@
from .attributes import AttributeDefinition, AttributeType, RangeAttributeDefinition
from .base_curriculum import BaseCurriculum
from .coach import Coach, GroupedScores, ScoreBoard, ScoreStats
__all__ = [
"AttributeType",
"AttributeDefinition",
"RangeAttributeDefinition",
"BaseCurriculum",
"Coach",
"ScoreBoard",
"GroupedScores",
"ScoreStats",
]

View file

@ -0,0 +1,73 @@
from collections import abc
from dataclasses import dataclass
from enum import StrEnum
from typing import Any, Optional
class AttributeType(StrEnum):
"""Defines how attribute levels should be interpreted"""
STATIC = "static" # Each level is independent
UBOUND = "ubound" # Each level is an upper bound
APPEND = "append" # Each level includes all previous levels
@dataclass(kw_only=True)
class AttributeDefinition:
name: str
levels: list
default_level: int
description: Optional[str] = None
attr_type: AttributeType = AttributeType.STATIC # Default to static
min_value: Optional[int | float] = None # Minimum value for numeric attributes
def validate_level(self, level: int, curriculum: str) -> None:
"""
Validate that a level is valid for an attribute.
Args:
level: Level to validate
curriculum: Name of the curriculum
Raises:
ValueError: If level is invalid
"""
# TODO: if > set as [-1], if <0 set as [0]
if not 0 <= level < len(self.levels):
raise ValueError(
f"Invalid level: {level} for attribute '{curriculum}.{self.name}'. "
f"Must be between 0 and {len(self.levels)-1}"
)
def get_level_value(self, level: int, curriculum: str) -> Any:
"""
Get the value for an attribute at a specific level based on its type.
Args:
attr: The attribute definition
level: Level to get value for
Returns:
Value for the attribute based on its level and type
"""
if self.attr_type == AttributeType.STATIC:
return self.levels[level]
elif self.attr_type == AttributeType.UBOUND:
return self.levels[level]
elif self.attr_type == AttributeType.APPEND:
return self.levels[: level + 1]
raise ValueError(f"Unknown attribute type: {self.attr_type} for attribute '{curriculum}.{self.name}'")
@dataclass(kw_only=True)
class ScalarAttributeDefinition(AttributeDefinition):
    # Name of the single config field this attribute's value is written to.
    field_name: str
@dataclass(kw_only=True)
class RangeAttributeDefinition(AttributeDefinition):
    """Attribute mapped onto a (lower, upper) pair of config fields."""

    lower_field_name: str  # config field receiving the range minimum
    upper_field_name: str  # config field receiving the range maximum

    def get_level_value(self, level: int, curriculum: str) -> Any:
        """Return the level value, normalized to an iterable of candidates."""
        value = super().get_level_value(level, curriculum)
        # Scalar levels (e.g. STATIC/UBOUND types) are wrapped so callers can
        # always take min()/max() of the result.
        return value if isinstance(value, abc.Iterable) else [value]

View file

@ -0,0 +1,108 @@
from typing import Any, Iterable, Optional
from ..factory import ConfigT
from .attributes import AttributeDefinition, RangeAttributeDefinition, ScalarAttributeDefinition
class BaseCurriculum:
    """Tracks per-attribute difficulty levels and materializes dataset
    configurations from them.

    Subclasses call `_define_attributes` in their `__init__`; callers then move
    difficulty via `increment_attr_level` / `decrement_attr_level` and build
    configs with `generate_configuration`.
    """

    def __init__(self, name: str, config_cls: ConfigT):
        """
        Args:
            name: Curriculum name, used to qualify attribute names in errors
            config_cls: Config class instantiated by `generate_configuration`
        """
        self.name = name
        self._config_cls = config_cls
        self._attributes: dict[str, AttributeDefinition] = {}
        self._current_levels: dict[str, int] = {}

    def generate_configuration(self, defaults: Optional[dict[str, Any]] = None) -> ConfigT:
        """Build a config instance reflecting the current attribute levels.

        Args:
            defaults: Base constructor arguments (e.g. size/seed); a copy is
                extended with the attribute-derived fields.

        Returns:
            A new instance of the curriculum's config class.
        """
        config_args = defaults.copy() if defaults is not None else {}
        for attr in self._attributes.values():
            if isinstance(attr, RangeAttributeDefinition):
                # Range attributes fill a (lower, upper) pair of config fields.
                vals = self.get_attr_value(attr.name)
                config_args[attr.lower_field_name] = min(vals)
                config_args[attr.upper_field_name] = max(vals)
            elif isinstance(attr, ScalarAttributeDefinition):
                val = self.get_attr_value(attr.name)
                config_args[attr.field_name] = val
        # (removed a leftover debug `print(config_args)` here)
        return self._config_cls(**config_args)

    @property
    def attributes(self) -> dict[str, AttributeDefinition]:
        """Get the curriculum's attributes"""
        return self._attributes

    def get_attribute(self, attr_name: str) -> AttributeDefinition:
        """Look up an attribute definition by name.

        Raises:
            KeyError: If the attribute does not exist
        """
        if attr_name not in self._attributes:
            raise KeyError(f"Attribute '{self.name}.{attr_name}' does not exist")
        return self._attributes[attr_name]

    def _define_attributes(self, attrs: Iterable[AttributeDefinition]) -> None:
        """Register attribute definitions; names must be unique.

        Raises:
            RuntimeError: If an attribute name is already defined
        """
        for attr in attrs:
            if attr.name in self.attributes:
                raise RuntimeError(f"Attribute with name {attr.name} is already defined.")
            self.attributes[attr.name] = attr

    def get_attr_level(self, attr_name: str) -> int:
        """
        Get the current level for an attribute.

        Args:
            attr_name: Name of the attribute

        Returns:
            Current level index for the attribute; the attribute's
            default_level when no level was set explicitly
        """
        attr = self.get_attribute(attr_name)
        return self._current_levels.get(attr_name, attr.default_level)

    def get_attr_value(self, attr_name: str) -> Any:
        """
        Get the current value for an attribute based on its level.

        Args:
            attr_name: Name of the attribute

        Returns:
            Current value for the attribute based on its level and type
        """
        attr = self.get_attribute(attr_name)
        level = self.get_attr_level(attr_name)
        return attr.get_level_value(level, curriculum=self.name)

    def set_attr_level(self, attr_name: str, level: int) -> None:
        """
        Set the level for an attribute.

        Args:
            attr_name: Name of the attribute
            level: New level index

        Raises:
            KeyError: If the attribute doesn't exist
            ValueError: If the level index is out of range
        """
        attr = self.get_attribute(attr_name)
        attr.validate_level(level, curriculum=self.name)
        self._current_levels[attr_name] = level

    def increment_attr_level(self, attr_name: str) -> bool:
        """
        Increment the level of an attribute if possible.

        Args:
            attr_name: Name of the attribute to increment

        Returns:
            bool: True if level was incremented, False if already at max level

        Raises:
            KeyError: If attribute doesn't exist
        """
        attr = self.get_attribute(attr_name)
        current_level = self.get_attr_level(attr_name)
        if current_level < len(attr.levels) - 1:
            self.set_attr_level(attr_name, current_level + 1)
            return True
        return False

    def decrement_attr_level(self, attr_name: str) -> bool:
        """
        Decrement the level of an attribute if possible.

        Args:
            attr_name: Name of the attribute to decrement

        Returns:
            bool: True if level was decremented, False if already at min level

        Raises:
            KeyError: If attribute doesn't exist
        """
        current_level = self.get_attr_level(attr_name)
        if current_level > 0:
            self.set_attr_level(attr_name, current_level - 1)
            return True
        return False

View file

@ -8,7 +8,7 @@ from pathlib import Path
from statistics import mean, stdev from statistics import mean, stdev
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
from .dataset import ProceduralDataset from ..dataset import ProceduralDataset
@dataclass @dataclass

View file

@ -5,6 +5,7 @@ Cognition tasks for training reasoning capabilities.
from .color_cube_rotation import ColorCubeRotationConfig, ColorCubeRotationDataset from .color_cube_rotation import ColorCubeRotationConfig, ColorCubeRotationDataset
from .figlet_fonts import FigletFontConfig, FigletFontDataset from .figlet_fonts import FigletFontConfig, FigletFontDataset
from .number_sequences import NumberSequenceConfig, NumberSequenceDataset from .number_sequences import NumberSequenceConfig, NumberSequenceDataset
from .rectangle_count import RectangleCountConfig, RectangleCountDataset
from .rubiks_cube import RubiksCubeConfig, RubiksCubeDataset from .rubiks_cube import RubiksCubeConfig, RubiksCubeDataset
__all__ = [ __all__ = [
@ -16,4 +17,6 @@ __all__ = [
"NumberSequenceDataset", "NumberSequenceDataset",
"RubiksCubeConfig", "RubiksCubeConfig",
"RubiksCubeDataset", "RubiksCubeDataset",
"RectangleCountConfig",
"RectangleCountDataset",
] ]

View file

@ -0,0 +1,135 @@
from dataclasses import dataclass
from random import Random
from typing import Dict, Optional
from ..factory import ProceduralDataset, register_dataset
def draw_rectangles_with_overlap(n, width, height, rng):
# Create a grid that holds a count of how many times a cell is drawn.
grid = [[0 for _ in range(width)] for _ in range(height)]
rectangles = []
max_attempts = 100000 # Prevent infinite loops in case of a crowded grid
attempts = 0
while len(rectangles) < n and attempts < max_attempts:
attempts += 1
# Ensure minimum width and height of 3.
# For a rectangle to be at least 3 cells wide, right must be at least left + 2.
# Similarly, bottom must be at least top + 2.
left = rng.randint(0, width - 3)
right = rng.randint(left + 2, width - 1)
top = rng.randint(0, height - 3)
bottom = rng.randint(top + 2, height - 1)
# Prepare a list of all the cells that would be updated.
cells_to_update = []
# Top edge:
for col in range(left, right + 1):
cells_to_update.append((top, col))
# Bottom edge:
for col in range(left, right + 1):
cells_to_update.append((bottom, col))
# Left edge (excluding corners already drawn):
for row in range(top + 1, bottom):
cells_to_update.append((row, left))
# Right edge (excluding corners already drawn):
for row in range(top + 1, bottom):
cells_to_update.append((row, right))
# Check if drawing this rectangle would cause any cell to exceed a count of 2.
conflict = False
for r, c in cells_to_update:
if grid[r][c] >= 2:
conflict = True
break
if conflict:
continue # Skip this rectangle candidate
# No conflict: update the grid counts.
for r, c in cells_to_update:
grid[r][c] += 1
# Save the rectangle (stored as (left, right, top, bottom)).
rectangles.append((left, right, top, bottom))
if len(rectangles) < n:
print(f"Only placed {len(rectangles)} rectangles after {attempts} attempts.")
# Print the grid.
# Use ' ' for an untouched cell, '#' for a single hit, and '█' for exactly two hits.
lines = ""
for row in grid:
line = "".join(" " if count == 0 else ("#" if count == 1 else "") for count in row)
lines = lines + line + "\n"
return lines, len(rectangles)
@dataclass
class RectangleCountConfig:
    """Configuration for RectangleCount puzzle generation"""

    max_rectangles: int = 10  # upper bound for the sampled rectangle count
    width: int = 80  # grid width in cells
    height: int = 80  # grid height in cells
    seed: Optional[int] = None
    size: int = 500  # virtual dataset size

    def validate(self):
        """Validate configuration parameters"""
        # max_rectangles feeds randint(1, max_rectangles), so it must be >= 1.
        assert self.max_rectangles >= 1, "max_rectangles must be gte 1"
        assert self.width >= 10, "width must be gte 10"
        assert self.height >= 10, "height must be gte 10"
class RectangleCountDataset(ProceduralDataset):
    """Generates ASCII rectangle-counting puzzles with configurable parameters"""

    def __init__(self, config: RectangleCountConfig):
        super().__init__(config=config, seed=config.seed, size=config.size)

    def __getitem__(self, idx: int) -> dict:
        """Generate a single RectangleCount task

        Returns:
            dict with keys:
                - question: str, the task description with the rendered grid
                - answer: str, the number of rectangles actually drawn
                - metadata: dict with generation parameters
        """
        # Per-index RNG keeps items deterministic regardless of access order.
        rng = Random(self.seed + idx)
        target = rng.randint(1, self.config.max_rectangles)
        # `answer` may be smaller than `target` if the grid got too crowded.
        puzzle, answer = draw_rectangles_with_overlap(target, self.config.width, self.config.height, rng)
        # NOTE(review): the overlap glyph quoted below should match the two-hit
        # character emitted by draw_rectangles_with_overlap — confirm they agree.
        puzz = f"How many rectangles do you see? Single rectangles are outlined with a '#', overlapping rectangles (max 2) are shown with ''. \n\n {puzzle}"
        return {
            "question": puzz,
            "answer": str(answer),
            "metadata": {},
        }

    def score_answer(self, answer: Optional[str], entry: Dict[str, any]) -> float:
        """Determine if the solution provided solves the RectangleCount task.

        Awards 1.0 for an exact (case- and newline-insensitive) match with the
        reference answer, 0.01 for any other non-None answer, and 0.0 for None.

        Args:
            answer (Optional[str]): The user's answer.
            entry (Dict[str, any]): The original dataset entry containing the correct answer.

        Returns:
            float: The computed score between 0.0 and 1.0.
        """
        if answer is None:  # was `answer == None`; identity check is the idiom
            return 0.0

        def normalize(text: str) -> str:
            return text.lower().replace("\n", "")

        # Partial credit (0.01) signals a well-formed but wrong attempt.
        return 1.0 if normalize(answer) == normalize(entry["answer"]) else 0.01


register_dataset("rectangle_count", RectangleCountDataset, RectangleCountConfig)

View file

@ -1,6 +1,7 @@
import pytest import pytest
from reasoning_gym.arithmetic import ChainSum, ChainSumConfig from reasoning_gym.arithmetic import ChainSum, ChainSumConfig
from reasoning_gym.arithmetic.chain_sum import ChainSumCurriculum
def test_chain_sum_config_validation(): def test_chain_sum_config_validation():
@ -127,3 +128,30 @@ def test_chain_sum_iteration():
first_items = list(dataset) first_items = list(dataset)
second_items = list(dataset) second_items = list(dataset)
assert first_items == second_items, "Multiple iterations should yield same items" assert first_items == second_items, "Multiple iterations should yield same items"
def test_chain_sum_curriculum():
    """Levels of the ChainSum curriculum map onto config term/digit ranges."""
    curriculum = ChainSumCurriculum()
    defaults = {"size": 150, "seed": 1}

    cfg: ChainSumConfig = curriculum.generate_configuration(defaults)
    assert (cfg.seed, cfg.size) == (1, 150)
    assert (cfg.min_digits, cfg.max_digits) == (1, 1)
    assert (cfg.min_terms, cfg.max_terms) == (2, 2)

    # Raising both attribute levels widens both ranges.
    for attr_name in ("num_terms", "num_digits"):
        curriculum.increment_attr_level(attr_name)
    cfg = curriculum.generate_configuration(defaults)
    assert (cfg.min_digits, cfg.max_digits) == (1, 2)
    assert (cfg.min_terms, cfg.max_terms) == (2, 3)

    # Lowering only num_digits leaves the num_terms range untouched.
    curriculum.decrement_attr_level("num_digits")
    cfg = curriculum.generate_configuration(defaults)
    assert (cfg.min_digits, cfg.max_digits) == (1, 1)
    assert (cfg.min_terms, cfg.max_terms) == (2, 3)

View file

@ -0,0 +1,88 @@
"""Tests for Count Primes questions generation"""
import pytest
from reasoning_gym.algorithmic.count_primes import CountPrimesConfig, CountPrimesDataset
def test_count_primes_config_validation():
    """Test that invalid configs raise appropriate errors"""
    # Both negative and zero upper bounds are rejected.
    for bad_max_n in (-1, 0):
        with pytest.raises(AssertionError):
            CountPrimesConfig(max_n=bad_max_n).validate()
def test_count_primes_dataset_deterministic():
    """Test that dataset generates same items with same seed"""
    config = CountPrimesConfig(seed=42, size=10)
    first = CountPrimesDataset(config)
    second = CountPrimesDataset(config)
    for idx in range(len(first)):
        assert first[idx] == second[idx]
def test_count_primes_dataset_items():
    """Test basic properties of generated items"""
    dataset = CountPrimesDataset(CountPrimesConfig(max_n=10, size=10, seed=42))
    for idx in range(len(dataset)):
        item = dataset[idx]
        # Every item is a dict with the standard keys.
        assert isinstance(item, dict)
        for key in ("question", "answer", "metadata"):
            assert key in item
        # Metadata records the interval and its primality flags.
        metadata = item["metadata"]
        for key in ("start", "end", "primes", "solution"):
            assert key in metadata
        assert metadata["start"] <= metadata["end"]
        assert len(metadata["primes"]) <= metadata["end"] - metadata["start"] + 1
def test_count_primes_dataset_iteration():
    """Test that iteration respects dataset size"""
    config = CountPrimesConfig(size=5, seed=42)
    dataset = CountPrimesDataset(config)
    first_pass = list(dataset)
    assert len(first_pass) == config.size
    # Repeated iteration yields exactly the same items.
    assert first_pass == list(dataset)
def test_count_primes_answer():
    """Test the _get_primes sieve helper"""
    config = CountPrimesConfig(seed=42)
    dataset = CountPrimesDataset(config)

    # Base cases: no primes below 2
    assert dataset._get_primes(n=0) == []
    assert dataset._get_primes(n=1) == []
    assert dataset._get_primes(n=2) == [False, False]

    # Primality flags for 2..10 (avoid `== True` comparisons, flake8 E712)
    primes = dataset._get_primes(n=11)
    expected = {2: True, 3: True, 4: False, 5: True, 6: False, 7: True, 8: False, 9: False, 10: False}
    for value, is_prime in expected.items():
        assert primes[value] is is_prime

35
tests/test_dice.py Normal file
View file

@ -0,0 +1,35 @@
import pytest
from reasoning_gym.arithmetic.dice import DiceConfig, DiceDataset
def test_dice():
    """Test basic properties and solution of generated items"""
    scenarios = (
        dict(seed=42, size=50, num_dice=8, max_dice_size=24),  # typical
        dict(seed=42, size=1, num_dice=1, max_dice_size=2),  # easy
        dict(seed=42, size=1, num_dice=40, max_dice_size=40),  # hard
    )
    for kwargs in scenarios:
        dataset = DiceDataset(DiceConfig(**kwargs))
        for item in dataset:
            assert isinstance(item, dict)
            for key in ("question", "answer", "metadata"):
                assert key in item
            # The reference answer scores 1.0; a missing answer scores 0.0.
            assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
            assert dataset.score_answer(answer=None, entry=item) == 0.0

View file

@ -0,0 +1,19 @@
import pytest
from reasoning_gym.cognition.rectangle_count import RectangleCountConfig, RectangleCountDataset
def test_rectangle_count():
    """Test basic properties and scoring of generated rectangle-count items"""
    # Renamed from `test_dice` — a copy-paste leftover from tests/test_dice.py
    # that mislabelled failures and would shadow on any future file merge.
    config = RectangleCountConfig(seed=42, size=50, max_rectangles=15, width=40, height=40)
    dataset = RectangleCountDataset(config)
    for item in dataset:
        assert isinstance(item, dict)
        assert "question" in item
        assert "answer" in item
        assert "metadata" in item
        # Test the scoring
        assert dataset.score_answer(answer=item["answer"], entry=item) == 1.0
        assert dataset.score_answer(answer=None, entry=item) == 0.0