diff --git a/reasoning_gym/core/attribute_monitor.py b/reasoning_gym/core/attribute_monitor.py new file mode 100644 index 00000000..830344c1 --- /dev/null +++ b/reasoning_gym/core/attribute_monitor.py @@ -0,0 +1,95 @@ +from typing import Dict, List, Any +from dataclasses import dataclass +from collections import defaultdict +import numpy as np + +@dataclass +class AttributeMonitor: + """Monitors performance for a specific attribute.""" + window_size: int = 10 # Number of recent problems to track + warmup_count: int = 10 # Number of problems before starting analysis # TODO: Implement warmup (not just level_history as can go back to level) + degradation_threshold: float = 0.9 # Threshold for degradation + std_plateau_threshold: float = 0.1 # Threshold for plateau + + def __post_init__(self): + self.curriculum = None # Will be set during initialization + self.attribute_name = None # Will be set during initialization + self.recent_scores: List[float] = [] # List of recent accuracy scores + self.level_history: Dict[int, List[float]] = defaultdict(list) # Scores for each difficulty level + self.best_scores: Dict[int, float] = {} # Best smoothed score achieved at each level + + def initialize(self, curriculum: Any, attribute_name: str): + """Initialize monitor with curriculum and attribute.""" + self.curriculum = curriculum + self.attribute_name = attribute_name + self.set_level(curriculum.get_attr_level(attribute_name)) + + @property + def current_level(self) -> int: + """Get current level from curriculum.""" + return self.curriculum.get_attr_level(self.attribute_name) + + def increment_level(self) -> bool: + """Increment difficulty level using curriculum.""" + if self.curriculum.increment_attr_level(self.attribute_name): + self.recent_scores = [] # Reset scores for new level + return True + return False + + def decrement_level(self) -> bool: + """Decrement difficulty level using curriculum.""" + if self.curriculum.decrement_attr_level(self.attribute_name): + self.recent_scores = [] # Reset scores for new level + return True + return False + + def set_level(self, level: int): + """Set difficulty level using curriculum.""" + self.curriculum.set_attr_level(self.attribute_name, level) + self.recent_scores = [] # Reset scores for new level + + def add_score(self, score: float): + """Add a new score and update metrics.""" + self.recent_scores.append(score) + if len(self.recent_scores) > self.window_size: + self.recent_scores.pop(0) + + self.level_history[self.current_level].append(score) + + # Update best score if current moving average is higher + if len(self.recent_scores) >= self.window_size: + current_avg = np.mean(self.recent_scores) + self.best_scores[self.current_level] = max( + current_avg, + self.best_scores.get(self.current_level, float('-inf')) + ) + + def get_current_accuracy(self) -> float: + """Get the current moving average accuracy.""" + if len(self.recent_scores) < self.window_size: + return 0.0 + return np.mean(self.recent_scores) + + # TODO: is_*, addscore merge + def is_improving(self) -> bool: + """Check if performance is improving.""" + if len(self.recent_scores) < self.window_size: + return False + current_avg = np.mean(self.recent_scores) + return current_avg > self.best_scores.get(self.current_level, float('-inf')) + + def is_plateau(self) -> bool: + """Check if performance has plateaued.""" + if len(self.recent_scores) < self.window_size: + return False + + recent_std = np.std(self.recent_scores) + return recent_std < self.std_plateau_threshold + + def is_degrading(self) -> bool: + """Check if performance is degrading.""" + if len(self.recent_scores) < self.window_size: + return False + + current_avg = np.mean(self.recent_scores) + return current_avg < self.best_scores.get(self.current_level, float('-inf')) * self.degradation_threshold \ No newline at end of file diff --git a/reasoning_gym/core/attributes.py b/reasoning_gym/core/attributes.py index 658bcaf9..92ea836e 100644 --- a/reasoning_gym/core/attributes.py +++ b/reasoning_gym/core/attributes.py @@ -16,12 +16,12 @@ class AttributeType(Enum): class AttributeDefinition: """Defines a difficulty attribute with its possible levels and properties""" levels: List[Any] - current_level: int + default_level: int description: str attr_type: AttributeType = AttributeType.STATIC # Default to static @classmethod - def validate_attributes(cls, attributes: Dict[str, 'AttributeDefinition'], valid_types: Set[AttributeType], curriculum: Optional[str] = None) -> None: + def validate_attributes(cls, attributes: Dict[str, 'AttributeDefinition'], valid_types: Set[AttributeType], curriculum: str) -> None: """ Validates that all attributes use types from the valid_types set. @@ -31,12 +31,94 @@ class AttributeDefinition: curriculum: A string identifier for the curriculum or class that owns these attributes Raises: - ValueError: If any attribute uses an invalid type + ValueError: If any attribute uses an invalid type or has invalid configuration """ + if not valid_types: + raise ValueError(f"Curriculum {curriculum} has no valid attribute types defined") + + if not attributes: + raise ValueError(f"Curriculum {curriculum} has no attributes defined") + for name, attr in attributes.items(): + # Check attribute type is valid if attr.attr_type not in valid_types: curriculum_class = f"{curriculum}." if curriculum else "" raise ValueError( f"Attribute '{curriculum_class}{name}' uses type {attr.attr_type.value} " f"which is not in the curriculum's valid types: {[t.value for t in valid_types]}" - ) \ No newline at end of file + ) + + # Check levels exist + if not attr.levels: + raise ValueError(f"Attribute '{curriculum}.{name}' has no levels defined") + + # Check default level is valid + if not 0 <= attr.default_level < len(attr.levels): + raise ValueError( + f"Invalid default level: {attr.default_level} for attribute '{curriculum}.{name}'. " + f"Must be between 0 and {len(attr.levels)-1}" + ) + + @classmethod + def check_attribute_exists(cls, attributes: Dict[str, 'AttributeDefinition'], attr_name: str, curriculum: str) -> 'AttributeDefinition': + """ + Check if attribute exists and return its definition. + + Args: + attributes: Dictionary of attribute definitions + attr_name: Name of the attribute to check + curriculum: Name of the curriculum + + Returns: + The AttributeDefinition for the attribute + + Raises: + KeyError: If attribute doesn't exist + """ + if attr_name not in attributes: + raise KeyError(f"Attribute '{curriculum}.{attr_name}' does not exist") + return attributes[attr_name] + + @classmethod + def validate_level(cls, attr: 'AttributeDefinition', level: int, attr_name: str, curriculum: str) -> None: + """ + Validate that a level is valid for an attribute. + + Args: + attr: The attribute definition + level: Level to validate + attr_name: Name of the attribute + curriculum: Name of the curriculum + + Raises: + ValueError: If level is invalid + """ + # TODO: if > set as [-1], if <0 set as [0] + if not 0 <= level < len(attr.levels): + raise ValueError( + f"Invalid level: {level} for attribute '{curriculum}.{attr_name}'. " + f"Must be between 0 and {len(attr.levels)-1}" + ) + + @classmethod + def get_level_value(cls, attr: 'AttributeDefinition', level: int, attr_name: str, curriculum: str) -> Any: + """ + Get the value for an attribute at a specific level based on its type. + + Args: + attr: The attribute definition + level: Level to get value for + attr_name: Name of the attribute + curriculum: Name of the curriculum + + Returns: + Value for the attribute based on its level and type + """ + if attr.attr_type == AttributeType.STATIC: + return attr.levels[level] + elif attr.attr_type == AttributeType.UBOUND: + return attr.levels[level] + elif attr.attr_type == AttributeType.APPEND: + return attr.levels[:level + 1] + + raise ValueError(f"Unknown attribute type: {attr.attr_type} for attribute '{curriculum}.{attr_name}'") \ No newline at end of file diff --git a/reasoning_gym/core/base_curriculum.py b/reasoning_gym/core/base_curriculum.py new file mode 100644 index 00000000..5188c327 --- /dev/null +++ b/reasoning_gym/core/base_curriculum.py @@ -0,0 +1,141 @@ +""" +Base class for exercise curricula that defines the interface and common functionality. +""" + +from typing import Dict, List, Any +from dataclasses import dataclass +from reasoning_gym.core.attributes import AttributeDefinition, AttributeType + +@dataclass +class Template: + """Defines a template for generating questions and answers""" + question: str + answer: str + metadata: Dict[str, Any] + +class BaseCurriculum: + """Base class for all exercise curricula""" + + def __init__(self, name: str): + self.name = name + self._attributes: Dict[str, AttributeDefinition] = {} + self._templates: List[Template] = [] + self._valid_types: set[AttributeType] = set() + self._current_levels: Dict[str, int] = {} + + # Let child class fill in the structure + self._init_curriculum() + + # Validate the filled structure + self._validate() + + # TODO: Why? + def _init_curriculum(self) -> None: + """ + Initialize curriculum-specific attributes and templates. + Must be implemented by subclasses to fill in the pre-defined structure. + """ + raise NotImplementedError("Subclasses must implement _init_curriculum()") + + def _validate(self) -> None: + """Validate the curriculum configuration""" + # Validate attributes + AttributeDefinition.validate_attributes( + self._attributes, + self._valid_types, + curriculum=self.name + ) + # Validate templates exist + if not self._templates: + raise ValueError(f"Curriculum {self.name} has no templates defined") + + @property + def attributes(self) -> Dict[str, AttributeDefinition]: + """Get the curriculum's attributes""" + return self._attributes + + @property + def templates(self) -> List[Template]: + """Get the curriculum's templates""" + return self._templates + + def get_attr_level(self, attr_name: str) -> int: + """ + Get the current level for an attribute. + + Args: + attr_name: Name of the attribute + + Returns: + Current level index for the attribute + """ + attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) + return self._current_levels.get(attr_name, attr.default_level) + + def get_attr_value(self, attr_name: str) -> Any: + """ + Get the current value for an attribute based on its level. + + Args: + attr_name: Name of the attribute + + Returns: + Current value for the attribute based on its level and type + """ + attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) + level = self.get_attr_level(attr_name) + return AttributeDefinition.get_level_value(attr, level, attr_name, self.name) + + def set_attr_level(self, attr_name: str, level: int) -> None: + """ + Set the level for an attribute. + + Args: + attr_name: Name of the attribute + level: New level index + """ + attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) + AttributeDefinition.validate_level(attr, level, attr_name, self.name) + self._current_levels[attr_name] = level + + def increment_attr_level(self, attr_name: str) -> bool: + """ + Increment the level of an attribute if possible. + + Args: + attr_name: Name of the attribute to increment + + Returns: + bool: True if level was incremented, False if already at max level + + Raises: + KeyError: If attribute doesn't exist + """ + attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) + current_level = self.get_attr_level(attr_name) + + if current_level < len(attr.levels) - 1: + self.set_attr_level(attr_name, current_level + 1) + return True + return False + + def decrement_attr_level(self, attr_name: str) -> bool: + """ + Decrement the level of an attribute if possible. + + Args: + attr_name: Name of the attribute to decrement + + Returns: + bool: True if level was decremented, False if already at min level + + Raises: + KeyError: If attribute doesn't exist + """ + attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) + current_level = self.get_attr_level(attr_name) + + if current_level > 0: + self.set_attr_level(attr_name, current_level - 1) + return True + return False diff --git a/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py b/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py index 3298733b..69809af5 100644 --- a/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py +++ b/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py @@ -1,93 +1,90 @@ """ Curriculum definition for the ChainSum exercise. -This file defines the templates, attributes, and difficulty levels for generating chain sum problems. """ -from typing import Dict, List, Union, Any +from typing import Dict, List, Any +from reasoning_gym.core.base_curriculum import BaseCurriculum, Template from reasoning_gym.core.attributes import AttributeDefinition, AttributeType -# Define which attribute types are valid for this curriculum -ATTRIBUTE_TYPES = { - AttributeType.STATIC, # For base numbers - AttributeType.UBOUND, # For ranges like digits and terms - AttributeType.APPEND # For operators and notations -} +class ChainSumCurriculum(BaseCurriculum): + def __init__(self): + super().__init__("ChainSumCurriculum") -# Curriculum definition -CURRICULUM_NAME = "ChainSumCurriculum" - -ATTRIBUTES = { - "num_digits": AttributeDefinition( - levels=[1, 2, 3, 4], - current_level=0, # Start with 1-digit numbers - description="Number of digits in each operand", - attr_type=AttributeType.UBOUND - ), - - "num_decimals": AttributeDefinition( - levels=[0, 1, 2], - current_level=0, # Start with integers - description="Number of decimal places in operands", - attr_type=AttributeType.UBOUND - ), - - "operators": AttributeDefinition( - levels=['+', '-', '*', '/', '**'], - current_level=0, # Start with basic operators - description="Set of operators that can be used, each level includes all previous operators", - attr_type=AttributeType.APPEND - ), - - "max_terms": AttributeDefinition( - levels=[2, 3, 4, 5], - current_level=0, # Start with 2 terms - description="Maximum number of terms in the expression", - attr_type=AttributeType.UBOUND - ), - - "sign": AttributeDefinition( - levels=['', '+', '-'], - current_level=0, # Start without negatives - description="Whether negative numbers are allowed", - attr_type=AttributeType.APPEND - ), - - "notation": AttributeDefinition( - levels=["regular", "scientific"], - current_level=0, - description="The notation to use for the expression", - attr_type=AttributeType.APPEND - ), - - "base": AttributeDefinition( - levels=[10, 2, 16], - current_level=0, - description="The base to use for the expression", - attr_type=AttributeType.STATIC - ) -} - -# Validate attributes use allowed types (and include the curriculum name in error messages) -AttributeDefinition.validate_attributes(ATTRIBUTES, ATTRIBUTE_TYPES, curriculum=CURRICULUM_NAME) - -# Template definitions -TEMPLATES = [ - { - "question": "What is {expression}?", - "answer": "{result}", - "metadata": {"type": "direct"} - }, - { - "question": "Calculate the following: {expression}", - "answer": "{result}", - "metadata": {"type": "direct"} - }, - { - "question": "Solve {expression}", - "answer": "{result}", - "metadata": {"type": "direct"} - } -] + def _init_curriculum(self) -> None: + """Initialize the ChainSum curriculum configuration""" + + # Define valid attribute types + self._valid_types = { + AttributeType.STATIC, # For base numbers + AttributeType.UBOUND, # For ranges like digits and terms + AttributeType.APPEND # For operators and notations + } + + # Define attributes + self._attributes = { + "num_digits": AttributeDefinition( + levels=[2, 4, 10], + default_level=0, # Start with 1-digit numbers + description="Number of digits in each operand", + attr_type=AttributeType.UBOUND + # distribution_type + ), + "num_decimals": AttributeDefinition( + levels=[0, 1, 2], + default_level=0, # Start with integers + description="Number of decimal places in operands", + attr_type=AttributeType.UBOUND + ), + "operators": AttributeDefinition( + levels=['+', '-', '*', '/', '**'], + default_level=0, # Start with basic operators + description="Set of operators that can be used", + attr_type=AttributeType.APPEND + ), + "max_terms": AttributeDefinition( + levels=[2, 3, 4, 5], + default_level=0, # Start with 2 terms + description="Maximum number of terms in the expression", + attr_type=AttributeType.UBOUND + ), + "sign": AttributeDefinition( + levels=['', '+', '-'], + default_level=0, # Start without negatives + description="Whether negative numbers are allowed", + attr_type=AttributeType.APPEND + ), + "notation": AttributeDefinition( + levels=["regular", "scientific"], + default_level=0, + description="The notation to use for the expression", + attr_type=AttributeType.APPEND + ), + "base": AttributeDefinition( + levels=[10, 2, 16], + default_level=0, + description="The base to use for the expression", + attr_type=AttributeType.STATIC + ) + } + + # Define templates + self._templates = [ + Template( + question="What is {expression}?", + answer="{result}", + metadata={"type": "direct"} + ), + Template( + question="Calculate the following: {expression}", + answer="{result}", + metadata={"type": "direct"} + ), + Template( + question="Solve {expression}", + answer="{result}", + metadata={"type": "direct"} + ) + ] # Generator functions for placeholders def generate_expression(attributes: Dict[str, Any]) -> Dict[str, str]: @@ -102,4 +99,4 @@ def generate_expression(attributes: Dict[str, Any]) -> Dict[str, str]: Dict containing the expression and result as strings """ # This will be implemented in the Exercise class - pass \ No newline at end of file + pass \ No newline at end of file diff --git a/reasoning_gym/principal.py b/reasoning_gym/principal.py new file mode 100644 index 00000000..727b7918 --- /dev/null +++ b/reasoning_gym/principal.py @@ -0,0 +1,87 @@ +from typing import Dict, Optional, Any +from enum import Enum +import logging +from collections import defaultdict +import numpy as np +from reasoning_gym.core.attribute_monitor import AttributeMonitor + +class CurriculumMode(Enum): + """Mode of curriculum operation for the Principal.""" + PREDEFINED = "predefined" # Follow a pre-defined curriculum + DYNAMIC = "dynamic" # Dynamically adjust based on performance + +class Principal: + """Manages exercise difficulty and curriculum progression.""" + + def __init__(self, mode: CurriculumMode = CurriculumMode.DYNAMIC): + self.exercises = {} # type: Dict[str, Any] # Exercise instances + self.exercise_curricula = {} # type: Dict[str, Any] # Loaded curricula + self.current_levels = defaultdict(dict) # Current difficulty levels + self.performance_monitors = defaultdict(dict) # Attribute monitors + self.curriculum_mode = mode + self.plateau_threshold = 0.8 + self.logger = logging.getLogger(__name__) + + def register_exercise(self, exercise_name: str, exercise_instance: Any, + curriculum: Any) -> None: + """Register a new exercise with its curriculum.""" + self.exercises[exercise_name] = exercise_instance + self.exercise_curricula[exercise_name] = curriculum + + # Initialize monitors for each attribute + for attr_name, attr_def in curriculum.attributes.items(): + monitor = AttributeMonitor() + monitor.initialize(curriculum, attr_name) + self.performance_monitors[exercise_name][attr_name] = monitor + + self.logger.info(f"Registered exercise: {exercise_name}") + + def generate_problem(self, exercise_name: str) -> tuple: + """Generate a problem from the specified exercise.""" + if exercise_name not in self.exercises: + raise KeyError(f"Exercise {exercise_name} not registered") + + exercise = self.exercises[exercise_name] + + # Set current attribute levels before generation + for attr_name, monitor in self.performance_monitors[exercise_name].items(): + exercise.set_attribute_level(attr_name, monitor.current_level) + + return exercise.generate() + + # TODO:Implement predefined + def update_performance(self, exercise_name: str, attribute_name: str, + score: float) -> None: + """Update performance metrics for an attribute.""" + + monitor = self.performance_monitors[exercise_name][attribute_name] + monitor.add_score(score) + + if self.curriculum_mode == CurriculumMode.DYNAMIC: + self._adjust_difficulty(exercise_name, attribute_name) + + # TODO: Implement representation + def _adjust_difficulty(self, exercise_name: str, attribute_name: str) -> None: + """Adjust difficulty based on performance metrics.""" + monitor = self.performance_monitors[exercise_name][attribute_name] + + # Implementation of the adjustment logic + if monitor.is_improving(): + # Keep current level while improving + return + elif monitor.is_plateau(): + current_acc = monitor.get_current_accuracy() + # Try to increase difficulty if accuracy is high + if current_acc > self.plateau_threshold: + if monitor.increment_level(): + self.logger.info( + f"Increasing difficulty for {exercise_name}.{attribute_name} " + f"to level {monitor.current_level}" + ) + elif monitor.is_degrading(): + # If performance is degrading, decrease difficulty + if monitor.decrement_level(): + self.logger.info( + f"Decreasing difficulty for {exercise_name}.{attribute_name} " + f"to level {monitor.current_level}" + ) \ No newline at end of file diff --git a/tests/core/test_dataset.py b/tests/test_dataset.py similarity index 100% rename from tests/core/test_dataset.py rename to tests/test_dataset.py