Merge remote-tracking branch 'origin/main' into feat/curr-adj

This commit is contained in:
joesharratt1229 2025-04-01 16:17:31 +00:00
commit 4b9c155cef
9 changed files with 158 additions and 9 deletions

5
.gitignore vendored
View file

@ -45,3 +45,8 @@ htmlcov/
# Jupyter Notebook # Jupyter Notebook
.ipynb_checkpoints/ .ipynb_checkpoints/
.virtual_documents/ .virtual_documents/
# logs
wandb/
outputs/
*.log

View file

@ -2,7 +2,7 @@
**Reasoning Gym** is a community-created Python library of procedural dataset generators and algorithmically verifiable reasoning environments for training reasoning models with reinforcement learning (RL). The goal is to generate virtually infinite training data with adjustable complexity. **Reasoning Gym** is a community-created Python library of procedural dataset generators and algorithmically verifiable reasoning environments for training reasoning models with reinforcement learning (RL). The goal is to generate virtually infinite training data with adjustable complexity.
It currently provides **more than 80** tasks over many domains, including but not limited to _algebra_, _arithmetic_, _computation_, _cognition_, _geometry_, _graph theory_, _logic_, and many common _games_. It currently provides **more than 100** tasks over many domains, including but not limited to _algebra_, _arithmetic_, _computation_, _cognition_, _geometry_, _graph theory_, _logic_, and many common _games_.
Some tasks have a single correct answer, while others, such as [Rubiks Cube](https://en.wikipedia.org/wiki/Rubik%27s_Cube) and [Countdown](<https://en.wikipedia.org/wiki/Countdown_(game_show)#Numbers_Round>), have many correct solutions. To support this, we provide a standard interface for procedurally verifying solutions. Some tasks have a single correct answer, while others, such as [Rubiks Cube](https://en.wikipedia.org/wiki/Rubik%27s_Cube) and [Countdown](<https://en.wikipedia.org/wiki/Countdown_(game_show)#Numbers_Round>), have many correct solutions. To support this, we provide a standard interface for procedurally verifying solutions.
@ -24,7 +24,7 @@ _Note that this project is currently under active development, and the version p
## 🛠️ Development ## 🛠️ Development
For development setup, see [CONTRIBUTING.md](CONTRIBUTING.md#delevloper-setup). For development setup, see [CONTRIBUTING.md](CONTRIBUTING.md#development-setup).
## ✨ Example Usage ## ✨ Example Usage

View file

@ -385,7 +385,12 @@ def create_performance_heatmap(summaries: Dict[str, Dict[str, Any]], categories:
for category, datasets in sorted(categories.items()): for category, datasets in sorted(categories.items()):
all_datasets.extend(sorted(datasets)) all_datasets.extend(sorted(datasets))
models = list(summaries.keys()) # Sort models by overall performance
overall_scores = {}
for model_name, summary in summaries.items():
scores = list(summary["dataset_best_scores"].values())
overall_scores[model_name] = np.mean(scores)
models = [item[0] for item in sorted(overall_scores.items(), key=lambda x: x[1], reverse=True)]
# Create score matrix # Create score matrix
score_matrix = np.zeros((len(models), len(all_datasets))) score_matrix = np.zeros((len(models), len(all_datasets)))

View file

@ -1,10 +1,11 @@
from .arc_1d import Arc1DConfig, Arc1DDataset from .arc_1d import Arc1DConfig, Arc1DCurriculum, Arc1DDataset
from .arc_agi import ArcAgiConfig, ArcAgiDataset from .arc_agi import ArcAgiConfig, ArcAgiDataset
from .rearc import ReArcConfig, ReArcCurriculum, ReArcDataset from .rearc import ReArcConfig, ReArcCurriculum, ReArcDataset
__all__ = [ __all__ = [
"Arc1DConfig", "Arc1DConfig",
"Arc1DDataset", "Arc1DDataset",
"Arc1DCurriculum",
"ArcAgiConfig", "ArcAgiConfig",
"ArcAgiDataset", "ArcAgiDataset",
"ReArcDataset", "ReArcDataset",

View file

@ -2,6 +2,7 @@ from dataclasses import dataclass
from random import Random from random import Random
from typing import Optional from typing import Optional
from ..coaching import BaseCurriculum, RangeAttributeDefinition
from ..dataset import ProceduralDataset from ..dataset import ProceduralDataset
from ..factory import register_dataset from ..factory import register_dataset
@ -108,9 +109,31 @@ class Arc1DDataset(ProceduralDataset):
"size": size, "size": size,
"train_examples": train_examples, "train_examples": train_examples,
"test_example": test_example, "test_example": test_example,
"difficulty": {
"size": (self.config.min_size, self.config.max_size),
},
}, },
} }
class Arc1DCurriculum(BaseCurriculum):
    """Curriculum for the ARC 1D dataset.

    Exposes one range attribute, ``size``, whose levels are written to the
    ``min_size`` / ``max_size`` fields of ``Arc1DConfig``.
    """

    def __init__(self):
        super().__init__(Arc1DCurriculum.__name__, Arc1DConfig)

        # The only tunable attribute: the grid-size range.
        grid_size = RangeAttributeDefinition(
            name="size",
            levels=[10, 25, 50, 100],
            lower_field_name="min_size",
            upper_field_name="max_size",
            description="Grid size",
            ensure_interval=True,
        )
        self._define_attributes(grid_size)
# Register the dataset # Register the dataset
register_dataset(DATASET_NAME, Arc1DDataset, Arc1DConfig) register_dataset(DATASET_NAME, Arc1DDataset, Arc1DConfig, Arc1DCurriculum)

View file

@ -4,7 +4,7 @@ Logic tasks for training reasoning capabilities.
from .aiw import AliceInWonderlandConfig, AliceInWonderlandCurriculum, AliceInWonderlandDataset from .aiw import AliceInWonderlandConfig, AliceInWonderlandCurriculum, AliceInWonderlandDataset
from .circuit_logic import CircuitLogicConfig, CircuitLogicCurriculum, CircuitLogicDataset from .circuit_logic import CircuitLogicConfig, CircuitLogicCurriculum, CircuitLogicDataset
from .knights_knaves import KnightsKnavesConfig, KnightsKnavesDataset from .knights_knaves import KnightsKnavesConfig, KnightsKnavesCurriculum, KnightsKnavesDataset
from .propositional_logic import PropositionalLogicConfig, PropositionalLogicCurriculum, PropositionalLogicDataset from .propositional_logic import PropositionalLogicConfig, PropositionalLogicCurriculum, PropositionalLogicDataset
from .self_reference import SelfReferenceConfig, SelfReferenceCurriculum, SelfReferenceDataset from .self_reference import SelfReferenceConfig, SelfReferenceCurriculum, SelfReferenceDataset
from .syllogisms import SyllogismConfig, SyllogismDataset from .syllogisms import SyllogismConfig, SyllogismDataset
@ -31,4 +31,5 @@ __all__ = [
"CircuitLogicCurriculum", "CircuitLogicCurriculum",
"KnightsKnavesConfig", "KnightsKnavesConfig",
"KnightsKnavesDataset", "KnightsKnavesDataset",
"KnightsKnavesCurriculum",
] ]

View file

@ -8,6 +8,8 @@ import numpy as np
from reasoning_gym.factory import ProceduralDataset, register_dataset from reasoning_gym.factory import ProceduralDataset, register_dataset
from ..coaching import BaseCurriculum, ScalarAttributeDefinition
DATASET_NAME = "knights_knaves" DATASET_NAME = "knights_knaves"
COMMON_NAMES = [ COMMON_NAMES = [
@ -462,6 +464,11 @@ class KnightsKnavesDataset(ProceduralDataset):
"solution": problem["solution"], "solution": problem["solution"],
"names": formatted["names"], "names": formatted["names"],
"knight_knave_terms": formatted["knight_knave"], "knight_knave_terms": formatted["knight_knave"],
"difficulty": {
"n_people": self.config.n_people,
"depth_constraint": self.config.depth_constraint,
"width_constraint": self.config.width_constraint,
},
} }
return {"question": question, "answer": answer, "metadata": metadata} return {"question": question, "answer": answer, "metadata": metadata}
@ -515,4 +522,30 @@ class KnightsKnavesDataset(ProceduralDataset):
return 0.0 return 0.0
register_dataset(DATASET_NAME, KnightsKnavesDataset, KnightsKnavesConfig) class KnightsKnavesCurriculum(BaseCurriculum):
def __init__(self):
    """Register the three scalar difficulty attributes of the puzzle."""
    super().__init__(KnightsKnavesCurriculum.__name__, KnightsKnavesConfig)

    # (attribute / config field name, human-readable description)
    attribute_specs = (
        ("n_people", "Number of people in the problem"),
        ("depth_constraint", "Depth of the problem"),
        ("width_constraint", "Width of the problem"),
    )

    # Each attribute gets its own fresh ``levels`` list so that no list
    # object is shared between definitions.
    definitions = [
        ScalarAttributeDefinition(
            name=attr_name,
            levels=[2, 3, 4, 5],
            description=attr_description,
            field_name=attr_name,
        )
        for attr_name, attr_description in attribute_specs
    ]
    self._define_attributes(*definitions)
register_dataset(DATASET_NAME, KnightsKnavesDataset, KnightsKnavesConfig, KnightsKnavesCurriculum)

View file

@ -2,7 +2,7 @@ from random import Random
import pytest import pytest
from reasoning_gym.arc import Arc1DConfig, Arc1DDataset from reasoning_gym.arc import Arc1DConfig, Arc1DCurriculum, Arc1DDataset
def test_arc_1d_config_validation(): def test_arc_1d_config_validation():
@ -41,6 +41,7 @@ def test_arc_1d_items():
assert "question" in item assert "question" in item
assert "answer" in item assert "answer" in item
assert "metadata" in item assert "metadata" in item
assert "difficulty" in item["metadata"]
# Check metadata contents # Check metadata contents
metadata = item["metadata"] metadata = item["metadata"]
@ -142,3 +143,44 @@ def test_arc_1d_generate_all_tasks():
break break
assert i < 20 assert i < 20
print(task_name, j, i, x) print(task_name, j, i, x)
def test_arc_1d_curriculum():
    """Test that the ARC 1D curriculum drives min_size/max_size via the "size" attribute."""
    curriculum = Arc1DCurriculum()

    # NOTE: the "size" key here is the dataset size (checked via cfg.size below);
    # it is distinct from the curriculum attribute that is also named "size".
    base_value = {"size": 150, "seed": 1}

    # generate_configuration() results are read as config objects below
    # (.seed, .size, .min_size, .max_size), hence the Arc1DConfig annotations.
    base_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert base_cfg.seed == 1
    assert base_cfg.size == 150
    assert base_cfg.min_size == 10
    assert base_cfg.max_size == 25

    # Test and validate increase in levels
    curriculum.increment_attr_level("size")
    increased_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert increased_cfg.min_size == 10
    assert increased_cfg.max_size == 50

    # Test and validate decrease in levels
    curriculum.decrement_attr_level("size")
    decreased_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert decreased_cfg.min_size == 10
    assert decreased_cfg.max_size == 25

    # Test upper bound boundary condition: more increments than there are
    # levels must leave the attribute at its highest level.
    for _ in range(10):
        curriculum.increment_attr_level("size")
    upper_bound_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert upper_bound_cfg.min_size == 10
    assert upper_bound_cfg.max_size == 100

    # Test lower bound boundary condition: more decrements than there are
    # levels must leave the attribute at its lowest level.
    for _ in range(10):
        curriculum.decrement_attr_level("size")
    lower_bound_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert lower_bound_cfg.min_size == 10
    assert lower_bound_cfg.max_size == 25

View file

@ -1,6 +1,6 @@
import pytest import pytest
from reasoning_gym.logic.knights_knaves import KnightsKnavesConfig, KnightsKnavesDataset from reasoning_gym.logic.knights_knaves import KnightsKnavesConfig, KnightsKnavesCurriculum, KnightsKnavesDataset
def test_config_validation(): def test_config_validation():
@ -234,3 +234,42 @@ def test_depth_constraint_specific_problem():
solutions = KnightsKnavesDataset.find_solution(test_statements) solutions = KnightsKnavesDataset.find_solution(test_statements)
assert len(solutions) == 1, "Should have exactly one solution" assert len(solutions) == 1, "Should have exactly one solution"
assert solutions[0] == (True, False, False) assert solutions[0] == (True, False, False)
def test_curriculum():
    """Test that the Knights & Knaves curriculum steps its three attributes independently."""
    curriculum = KnightsKnavesCurriculum()
    assert len(curriculum.attributes) == 3

    base_value = {"size": 150, "seed": 1}

    base_cfg = curriculum.generate_configuration(base_value)
    assert base_cfg.seed == 1
    assert base_cfg.size == 150
    assert base_cfg.n_people == 2
    assert base_cfg.depth_constraint == 2
    # width_constraint is checked at every later stage, so pin its starting
    # value as well; otherwise a wrong level-0 value would go unnoticed.
    assert base_cfg.width_constraint == 2

    # test incrementing attribute levels
    curriculum.increment_attr_level("n_people")
    curriculum.increment_attr_level("depth_constraint")
    curriculum.increment_attr_level("width_constraint")
    increased_cfg = curriculum.generate_configuration(base_value)
    assert increased_cfg.n_people == 3
    assert increased_cfg.depth_constraint == 3
    assert increased_cfg.width_constraint == 3

    # test decrementing attribute level: only n_people moves back
    curriculum.decrement_attr_level("n_people")
    partially_decreased_cfg = curriculum.generate_configuration(base_value)
    assert partially_decreased_cfg.n_people == 2
    assert partially_decreased_cfg.depth_constraint == 3
    assert partially_decreased_cfg.width_constraint == 3

    # incrementing all three again: n_people returns to 3, the others reach 4
    curriculum.increment_attr_level("n_people")
    curriculum.increment_attr_level("depth_constraint")
    curriculum.increment_attr_level("width_constraint")
    increased_cfg = curriculum.generate_configuration(base_value)
    assert increased_cfg.n_people == 3
    assert increased_cfg.depth_constraint == 4
    assert increased_cfg.width_constraint == 4