mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-28 17:29:39 +00:00
Merge remote-tracking branch 'origin/main' into feat/curr-adj
This commit is contained in:
commit
4b9c155cef
9 changed files with 158 additions and 9 deletions
5
.gitignore
vendored
5
.gitignore
vendored
|
|
@ -45,3 +45,8 @@ htmlcov/
|
|||
# Jupyter Notebook
|
||||
.ipynb_checkpoints/
|
||||
.virtual_documents/
|
||||
|
||||
# logs
|
||||
wandb/
|
||||
outputs/
|
||||
*.log
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
**Reasoning Gym** is a community-created Python library of procedural dataset generators and algorithmically verifiable reasoning environments for training reasoning models with reinforcement learning (RL). The goal is to generate virtually infinite training data with adjustable complexity.
|
||||
|
||||
It currently provides **more than 80** tasks over many domains, including but not limited to _algebra_, _arithmetic_, _computation_, _cognition_, _geometry_, _graph theory_, _logic_, and many common _games_.
|
||||
It currently provides **more than 100** tasks over many domains, including but not limited to _algebra_, _arithmetic_, _computation_, _cognition_, _geometry_, _graph theory_, _logic_, and many common _games_.
|
||||
|
||||
Some tasks have a single correct answer, while others, such as [Rubik's Cube](https://en.wikipedia.org/wiki/Rubik%27s_Cube) and [Countdown](<https://en.wikipedia.org/wiki/Countdown_(game_show)#Numbers_Round>), have many correct solutions. To support this, we provide a standard interface for procedurally verifying solutions.
|
||||
|
||||
|
|
@ -24,7 +24,7 @@ _Note that this project is currently under active development, and the version p
|
|||
|
||||
## 🛠️ Development
|
||||
|
||||
For development setup, see [CONTRIBUTING.md](CONTRIBUTING.md#delevloper-setup).
|
||||
For development setup, see [CONTRIBUTING.md](CONTRIBUTING.md#development-setup).
|
||||
|
||||
## ✨ Example Usage
|
||||
|
||||
|
|
|
|||
|
|
@ -385,7 +385,12 @@ def create_performance_heatmap(summaries: Dict[str, Dict[str, Any]], categories:
|
|||
for category, datasets in sorted(categories.items()):
|
||||
all_datasets.extend(sorted(datasets))
|
||||
|
||||
models = list(summaries.keys())
|
||||
# Sort models by overall performance
|
||||
overall_scores = {}
|
||||
for model_name, summary in summaries.items():
|
||||
scores = list(summary["dataset_best_scores"].values())
|
||||
overall_scores[model_name] = np.mean(scores)
|
||||
models = [item[0] for item in sorted(overall_scores.items(), key=lambda x: x[1], reverse=True)]
|
||||
|
||||
# Create score matrix
|
||||
score_matrix = np.zeros((len(models), len(all_datasets)))
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
from .arc_1d import Arc1DConfig, Arc1DDataset
|
||||
from .arc_1d import Arc1DConfig, Arc1DCurriculum, Arc1DDataset
|
||||
from .arc_agi import ArcAgiConfig, ArcAgiDataset
|
||||
from .rearc import ReArcConfig, ReArcCurriculum, ReArcDataset
|
||||
|
||||
__all__ = [
|
||||
"Arc1DConfig",
|
||||
"Arc1DDataset",
|
||||
"Arc1DCurriculum",
|
||||
"ArcAgiConfig",
|
||||
"ArcAgiDataset",
|
||||
"ReArcDataset",
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ from dataclasses import dataclass
|
|||
from random import Random
|
||||
from typing import Optional
|
||||
|
||||
from ..coaching import BaseCurriculum, RangeAttributeDefinition
|
||||
from ..dataset import ProceduralDataset
|
||||
from ..factory import register_dataset
|
||||
|
||||
|
|
@ -108,9 +109,31 @@ class Arc1DDataset(ProceduralDataset):
|
|||
"size": size,
|
||||
"train_examples": train_examples,
|
||||
"test_example": test_example,
|
||||
"difficulty": {
|
||||
"size": (self.config.min_size, self.config.max_size),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class Arc1DCurriculum(BaseCurriculum):
    """Curriculum for ARC 1D tasks.

    Defines a single range attribute, ``size``, whose levels drive the
    ``min_size`` / ``max_size`` fields of ``Arc1DConfig``.
    """

    def __init__(self):
        super().__init__(Arc1DCurriculum.__name__, Arc1DConfig)

        # The only tunable attribute: the grid size range.
        size_attribute = RangeAttributeDefinition(
            name="size",
            levels=[10, 25, 50, 100],
            lower_field_name="min_size",
            upper_field_name="max_size",
            description="Grid size",
            ensure_interval=True,
        )
        self._define_attributes(size_attribute)
|
||||
|
||||
|
||||
# Register the dataset
|
||||
register_dataset(DATASET_NAME, Arc1DDataset, Arc1DConfig)
|
||||
register_dataset(DATASET_NAME, Arc1DDataset, Arc1DConfig, Arc1DCurriculum)
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ Logic tasks for training reasoning capabilities.
|
|||
|
||||
from .aiw import AliceInWonderlandConfig, AliceInWonderlandCurriculum, AliceInWonderlandDataset
|
||||
from .circuit_logic import CircuitLogicConfig, CircuitLogicCurriculum, CircuitLogicDataset
|
||||
from .knights_knaves import KnightsKnavesConfig, KnightsKnavesDataset
|
||||
from .knights_knaves import KnightsKnavesConfig, KnightsKnavesCurriculum, KnightsKnavesDataset
|
||||
from .propositional_logic import PropositionalLogicConfig, PropositionalLogicCurriculum, PropositionalLogicDataset
|
||||
from .self_reference import SelfReferenceConfig, SelfReferenceCurriculum, SelfReferenceDataset
|
||||
from .syllogisms import SyllogismConfig, SyllogismDataset
|
||||
|
|
@ -31,4 +31,5 @@ __all__ = [
|
|||
"CircuitLogicCurriculum",
|
||||
"KnightsKnavesConfig",
|
||||
"KnightsKnavesDataset",
|
||||
"KnightsKnavesCurriculum",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ import numpy as np
|
|||
|
||||
from reasoning_gym.factory import ProceduralDataset, register_dataset
|
||||
|
||||
from ..coaching import BaseCurriculum, ScalarAttributeDefinition
|
||||
|
||||
DATASET_NAME = "knights_knaves"
|
||||
|
||||
COMMON_NAMES = [
|
||||
|
|
@ -462,6 +464,11 @@ class KnightsKnavesDataset(ProceduralDataset):
|
|||
"solution": problem["solution"],
|
||||
"names": formatted["names"],
|
||||
"knight_knave_terms": formatted["knight_knave"],
|
||||
"difficulty": {
|
||||
"n_people": self.config.n_people,
|
||||
"depth_constraint": self.config.depth_constraint,
|
||||
"width_constraint": self.config.width_constraint,
|
||||
},
|
||||
}
|
||||
|
||||
return {"question": question, "answer": answer, "metadata": metadata}
|
||||
|
|
@ -515,4 +522,30 @@ class KnightsKnavesDataset(ProceduralDataset):
|
|||
return 0.0
|
||||
|
||||
|
||||
register_dataset(DATASET_NAME, KnightsKnavesDataset, KnightsKnavesConfig)
|
||||
class KnightsKnavesCurriculum(BaseCurriculum):
    """Curriculum for Knights and Knaves puzzles.

    Defines three scalar attributes (``n_people``, ``depth_constraint``,
    ``width_constraint``). Each one uses the levels 2, 3, 4, 5 and is bound
    via ``field_name`` to the ``KnightsKnavesConfig`` field of the same name.
    """

    def __init__(self):
        super().__init__(KnightsKnavesCurriculum.__name__, KnightsKnavesConfig)

        # (attribute name == config field name, human-readable description)
        scalar_specs = (
            ("n_people", "Number of people in the problem"),
            ("depth_constraint", "Depth of the problem"),
            ("width_constraint", "Width of the problem"),
        )
        self._define_attributes(
            *(
                ScalarAttributeDefinition(
                    name=attr_name,
                    # A fresh list per attribute so definitions never share state.
                    levels=[2, 3, 4, 5],
                    description=attr_description,
                    field_name=attr_name,
                )
                for attr_name, attr_description in scalar_specs
            )
        )
|
||||
|
||||
|
||||
register_dataset(DATASET_NAME, KnightsKnavesDataset, KnightsKnavesConfig, KnightsKnavesCurriculum)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ from random import Random
|
|||
|
||||
import pytest
|
||||
|
||||
from reasoning_gym.arc import Arc1DConfig, Arc1DDataset
|
||||
from reasoning_gym.arc import Arc1DConfig, Arc1DCurriculum, Arc1DDataset
|
||||
|
||||
|
||||
def test_arc_1d_config_validation():
|
||||
|
|
@ -41,6 +41,7 @@ def test_arc_1d_items():
|
|||
assert "question" in item
|
||||
assert "answer" in item
|
||||
assert "metadata" in item
|
||||
assert "difficulty" in item["metadata"]
|
||||
|
||||
# Check metadata contents
|
||||
metadata = item["metadata"]
|
||||
|
|
@ -142,3 +143,44 @@ def test_arc_1d_generate_all_tasks():
|
|||
break
|
||||
assert i < 20
|
||||
print(task_name, j, i, x)
|
||||
|
||||
|
||||
def test_arc_1d_curriculum():
    """Test that the ARC 1D curriculum maps "size" levels onto min_size/max_size.

    Walks the ``size`` attribute up and down and checks the generated
    configuration at the base level, after one increment, after one
    decrement, and at both ends after over-stepping the level list.
    """
    curriculum = Arc1DCurriculum()
    base_value = {"size": 150, "seed": 1}

    # generate_configuration yields a config object (it exposes seed/size/
    # min_size/max_size), so it is annotated as Arc1DConfig, not the curriculum.
    base_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)

    assert base_cfg.seed == 1
    assert base_cfg.size == 150
    assert base_cfg.min_size == 10
    assert base_cfg.max_size == 25

    # Test and validate increase in levels
    curriculum.increment_attr_level("size")

    increased_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert increased_cfg.min_size == 10
    assert increased_cfg.max_size == 50

    # Test and validate decrease in levels
    curriculum.decrement_attr_level("size")

    decreased_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert decreased_cfg.min_size == 10
    assert decreased_cfg.max_size == 25

    # Test upper bound boundary condition: more increments than levels exist
    for _ in range(10):
        curriculum.increment_attr_level("size")
    upper_bound_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert upper_bound_cfg.min_size == 10
    assert upper_bound_cfg.max_size == 100

    # Test lower bound boundary condition: more decrements than levels exist
    for _ in range(10):
        curriculum.decrement_attr_level("size")
    lower_bound_cfg: Arc1DConfig = curriculum.generate_configuration(base_value)
    assert lower_bound_cfg.min_size == 10
    assert lower_bound_cfg.max_size == 25
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
|
||||
from reasoning_gym.logic.knights_knaves import KnightsKnavesConfig, KnightsKnavesDataset
|
||||
from reasoning_gym.logic.knights_knaves import KnightsKnavesConfig, KnightsKnavesCurriculum, KnightsKnavesDataset
|
||||
|
||||
|
||||
def test_config_validation():
|
||||
|
|
@ -234,3 +234,42 @@ def test_depth_constraint_specific_problem():
|
|||
solutions = KnightsKnavesDataset.find_solution(test_statements)
|
||||
assert len(solutions) == 1, "Should have exactly one solution"
|
||||
assert solutions[0] == (True, False, False)
|
||||
|
||||
|
||||
def test_curriculum():
    """Test level stepping of the Knights and Knaves curriculum.

    Checks the base configuration for all three attributes, then verifies
    that incrementing and decrementing individual attributes is reflected
    in the generated configuration.
    """
    curriculum = KnightsKnavesCurriculum()

    assert len(curriculum.attributes) == 3

    base_value = {"size": 150, "seed": 1}

    base_cfg = curriculum.generate_configuration(base_value)

    assert base_cfg.seed == 1
    assert base_cfg.size == 150
    assert base_cfg.n_people == 2
    assert base_cfg.depth_constraint == 2
    # All three attributes share the same level list, so the base level of
    # width_constraint is verified alongside the other two.
    assert base_cfg.width_constraint == 2

    # test incrementing attribute levels
    curriculum.increment_attr_level("n_people")
    curriculum.increment_attr_level("depth_constraint")
    curriculum.increment_attr_level("width_constraint")

    increased_cfg = curriculum.generate_configuration(base_value)
    assert increased_cfg.n_people == 3
    assert increased_cfg.depth_constraint == 3
    assert increased_cfg.width_constraint == 3

    # test decrementing attribute level
    curriculum.decrement_attr_level("n_people")
    partially_decreased_cfg = curriculum.generate_configuration(base_value)
    assert partially_decreased_cfg.n_people == 2
    assert partially_decreased_cfg.depth_constraint == 3
    assert partially_decreased_cfg.width_constraint == 3

    # incrementing all three again: n_people returns to 3, the others reach 4
    curriculum.increment_attr_level("n_people")
    curriculum.increment_attr_level("depth_constraint")
    curriculum.increment_attr_level("width_constraint")
    final_cfg = curriculum.generate_configuration(base_value)
    assert final_cfg.n_people == 3
    assert final_cfg.depth_constraint == 4
    assert final_cfg.width_constraint == 4
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue