from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

import numpy as np


class PerformanceTrend(Enum):
    """Performance trend states for an attribute."""

    INSUFFICIENT_DATA = "insufficient_data"
    IMPROVING = "improving"
    PLATEAU_HIGH_ACC = "plateau_high_acc"
    PLATEAU_LOW_ACC = "plateau_low_acc"
    DEGRADING = "degrading"
    STABLE = "stable"


@dataclass
class AttributeMonitor:
    """Monitors performance for a specific attribute.

    Scores are kept in a sliding window of ``window_size`` entries. Every
    call to :meth:`add_score` classifies the window into a
    :class:`PerformanceTrend`. The difficulty level itself is owned by the
    curriculum object passed to :meth:`initialize`; this class only forwards
    level changes to it and clears the window when the level changes.
    """

    # TODO: Different vars for different exercises, attributes
    window_size: int = 10  # Number of recent problems to track
    # TODO: Implement warmup (not just level_history as can go back to level)
    warmup_count: int = 10  # Number of problems before starting analysis
    high_acc_threshold: float = 0.8  # Threshold for high accuracy
    degradation_threshold: float = 0.9  # Threshold for degradation
    std_plateau_threshold: float = 0.1  # Threshold for plateau

    def __post_init__(self):
        # A non-positive window would leave the window permanently empty and
        # make the moving average NaN, so reject it up front.
        if self.window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {self.window_size}")

        self.curriculum = None  # Will be set during initialization
        self.attribute_name = None  # Will be set during initialization
        self.recent_scores: List[float] = []  # List of recent accuracy scores
        # Scores for each difficulty level
        self.level_history: Dict[int, List[float]] = defaultdict(list)
        # Best smoothed score achieved at each level
        self.best_scores: Dict[int, float] = {}

    def initialize(self, curriculum: Any, attribute_name: str):
        """Bind the monitor to ``curriculum`` / ``attribute_name``.

        Re-applies the curriculum's current level through :meth:`set_level`,
        which also clears the score window.
        """
        self.curriculum = curriculum
        self.attribute_name = attribute_name
        self.set_level(curriculum.get_attr_level(attribute_name))

    @property
    def current_level(self) -> int:
        """Get current level from curriculum."""
        return self.curriculum.get_attr_level(self.attribute_name)

    def increment_level(self) -> bool:
        """Increment difficulty level using curriculum.

        Returns:
            True if the curriculum reported a change; the score window is
            cleared in that case. False otherwise.
        """
        if self.curriculum.increment_attr_level(self.attribute_name):
            self.recent_scores = []  # Reset scores for new level
            return True
        return False

    def decrement_level(self) -> bool:
        """Decrement difficulty level using curriculum.

        Returns:
            True if the curriculum reported a change; the score window is
            cleared in that case. False otherwise.
        """
        if self.curriculum.decrement_attr_level(self.attribute_name):
            self.recent_scores = []  # Reset scores for new level
            return True
        return False

    def set_level(self, level: int):
        """Set difficulty level using curriculum and clear the score window."""
        self.curriculum.set_attr_level(self.attribute_name, level)
        self.recent_scores = []  # Reset scores for new level

    def add_score(self, score: float) -> PerformanceTrend:
        """
        Add a new score and analyze the performance trend.

        The checks run in this order once the window is full: new best moving
        average (IMPROVING), low standard deviation (PLATEAU_*), moving
        average below ``degradation_threshold`` times the best (DEGRADING),
        otherwise STABLE.

        Returns:
            PerformanceTrend: The current performance trend
        """
        self.recent_scores.append(score)
        # Slice-trim instead of a single pop(0) so the window also recovers
        # if it is ever longer than window_size by more than one entry.
        if len(self.recent_scores) > self.window_size:
            del self.recent_scores[:-self.window_size]

        level = self.current_level
        self.level_history[level].append(score)

        # Not enough data to analyze trends
        if len(self.recent_scores) < self.window_size:
            return PerformanceTrend.INSUFFICIENT_DATA

        # Coerce NumPy scalars to builtin floats so stored values match the
        # declared Dict[int, float] type.
        current_avg = float(np.mean(self.recent_scores))
        current_best = self.best_scores.get(level, float("-inf"))

        # Update best score if current moving average is higher
        if current_avg > current_best:
            self.best_scores[level] = current_avg
            return PerformanceTrend.IMPROVING

        # Check for plateau
        recent_std = float(np.std(self.recent_scores))
        if recent_std < self.std_plateau_threshold:
            if current_avg > self.high_acc_threshold:
                return PerformanceTrend.PLATEAU_HIGH_ACC
            return PerformanceTrend.PLATEAU_LOW_ACC

        # Check for degradation
        if current_avg < current_best * self.degradation_threshold:
            return PerformanceTrend.DEGRADING

        return PerformanceTrend.STABLE

    def get_current_accuracy(self) -> float:
        """Get the current moving average accuracy (0.0 until the window is full)."""
        if len(self.recent_scores) < self.window_size:
            return 0.0
        return float(np.mean(self.recent_scores))
raise ValueError(f"Attribute '{curriculum}.{name}' has no levels defined") - + # Check default level is valid if not 0 <= attr.default_level < len(attr.levels): raise ValueError( @@ -63,15 +63,15 @@ class AttributeDefinition: def check_attribute_exists(cls, attributes: Dict[str, 'AttributeDefinition'], attr_name: str, curriculum: str) -> 'AttributeDefinition': """ Check if attribute exists and return its definition. - + Args: attributes: Dictionary of attribute definitions attr_name: Name of the attribute to check curriculum: Name of the curriculum - + Returns: The AttributeDefinition for the attribute - + Raises: KeyError: If attribute doesn't exist """ @@ -83,13 +83,13 @@ class AttributeDefinition: def validate_level(cls, attr: 'AttributeDefinition', level: int, attr_name: str, curriculum: str) -> None: """ Validate that a level is valid for an attribute. - + Args: attr: The attribute definition level: Level to validate attr_name: Name of the attribute curriculum: Name of the curriculum - + Raises: ValueError: If level is invalid """ @@ -104,13 +104,13 @@ class AttributeDefinition: def get_level_value(cls, attr: 'AttributeDefinition', level: int, attr_name: str, curriculum: str) -> Any: """ Get the value for an attribute at a specific level based on its type. 
- + Args: attr: The attribute definition level: Level to get value for attr_name: Name of the attribute curriculum: Name of the curriculum - + Returns: Value for the attribute based on its level and type """ @@ -120,5 +120,5 @@ class AttributeDefinition: return attr.levels[level] elif attr.attr_type == AttributeType.APPEND: return attr.levels[:level + 1] - + raise ValueError(f"Unknown attribute type: {attr.attr_type} for attribute '{curriculum}.{attr_name}'") \ No newline at end of file diff --git a/reasoning_gym/core/base_curriculum.py b/reasoning_gym/core/base_curriculum.py index 5188c327..7cfcbb2e 100644 --- a/reasoning_gym/core/base_curriculum.py +++ b/reasoning_gym/core/base_curriculum.py @@ -22,10 +22,10 @@ class BaseCurriculum: self._templates: List[Template] = [] self._valid_types: set[AttributeType] = set() self._current_levels: Dict[str, int] = {} - + # Let child class fill in the structure self._init_curriculum() - + # Validate the filled structure self._validate() @@ -58,38 +58,38 @@ class BaseCurriculum: def templates(self) -> List[Template]: """Get the curriculum's templates""" return self._templates - + def get_attr_level(self, attr_name: str) -> int: """ Get the current level for an attribute. - + Args: attr_name: Name of the attribute - + Returns: Current level index for the attribute """ attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) return self._current_levels.get(attr_name, attr.default_level) - + def get_attr_value(self, attr_name: str) -> Any: """ Get the current value for an attribute based on its level. 
- + Args: attr_name: Name of the attribute - + Returns: Current value for the attribute based on its level and type """ attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) level = self.get_attr_level(attr_name) return AttributeDefinition.get_level_value(attr, level, attr_name, self.name) - + def set_attr_level(self, attr_name: str, level: int) -> None: """ Set the level for an attribute. - + Args: attr_name: Name of the attribute level: New level index @@ -101,19 +101,19 @@ class BaseCurriculum: def increment_attr_level(self, attr_name: str) -> bool: """ Increment the level of an attribute if possible. - + Args: attr_name: Name of the attribute to increment - + Returns: bool: True if level was incremented, False if already at max level - + Raises: KeyError: If attribute doesn't exist """ attr = AttributeDefinition.check_attribute_exists(self._attributes, attr_name, self.name) current_level = self.get_attr_level(attr_name) - + if current_level < len(attr.levels) - 1: self.set_attr_level(attr_name, current_level + 1) return True @@ -122,19 +122,19 @@ class BaseCurriculum: def decrement_attr_level(self, attr_name: str) -> bool: """ Decrement the level of an attribute if possible. 
import logging
import re
from typing import Any, Dict, Optional, Tuple

logger = logging.getLogger(__name__)  # TODO: Why only here

_DATASET_SUFFIX = "Dataset"
_CURRICULUM_SUFFIX = "Curriculum"

# Zero-width CamelCase word boundaries: lower/digit -> Upper ("fooBar"), and
# the end of an acronym run before a capitalised word ("HTTPServer").
_CAMEL_BOUNDARY = re.compile(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")


def _camel_to_snake(name: str) -> str:
    """Convert ``CamelCase`` to ``snake_case``, keeping acronym runs together."""
    return _CAMEL_BOUNDARY.sub("_", name).lower().lstrip("_")


class ExerciseRegistrar:
    """Handles registration of exercises and curricula."""

    @staticmethod
    def register_all(
        exercises_module: Optional[Any] = None,
        curricula_module: Optional[Any] = None,
    ) -> Dict[str, Tuple[Any, Any]]:
        """
        Register all exercises and their curricula.

        Every name in ``exercises_module.__all__`` that ends with ``Dataset``
        is paired with the attribute ``<Base>Curriculum`` of
        ``curricula_module``. Both classes are instantiated with no arguments.
        A pair that fails to instantiate is logged and skipped; a dataset
        without a matching curriculum is logged as a warning and skipped.

        Args:
            exercises_module: Object exposing ``__all__`` and the dataset
                classes. Defaults to ``reasoning_gym.exercises``, imported
                lazily so importing this module has no package-level
                dependency.
            curricula_module: Object exposing the curriculum classes.
                Defaults to ``reasoning_gym.curricula`` (lazy import).

        Returns:
            Dict of {exercise_name: (exercise_instance, curriculum_instance)}
            where ``exercise_name`` is the snake_case form of the class name
            without its ``Dataset`` suffix.
        """
        if exercises_module is None:
            from reasoning_gym import exercises as exercises_module
        if curricula_module is None:
            from reasoning_gym import curricula as curricula_module

        registered: Dict[str, Tuple[Any, Any]] = {}

        # Get all Dataset classes from exercises module
        for class_name in exercises_module.__all__:
            if not class_name.endswith(_DATASET_SUFFIX):
                continue

            exercise_base = class_name[: -len(_DATASET_SUFFIX)]
            curriculum_name = f"{exercise_base}{_CURRICULUM_SUFFIX}"

            curriculum_class = getattr(curricula_module, curriculum_name, None)
            if curriculum_class is None:
                logger.warning(f"ExerciseRegistrar: No curriculum found for {class_name}")
                continue

            try:
                exercise_class = getattr(exercises_module, class_name)
                # Create instances
                exercise_instance = exercise_class()
                curriculum_instance = curriculum_class()
            except Exception as e:
                # Deliberate best-effort: one broken exercise must not
                # prevent the others from being registered.
                logger.error(
                    f"ExerciseRegistrar: Error instantiating {class_name}: {e}",
                    exc_info=True,
                )
                continue

            # Keep the registry key in its own variable so the loop variable
            # (and the log messages above) always refer to the class name.
            exercise_key = _camel_to_snake(exercise_base)
            registered[exercise_key] = (exercise_instance, curriculum_instance)
            logger.info(f"ExerciseRegistrar: Registered exercise: {exercise_key}")

        return registered
b/reasoning_gym/curricula/arithmetic/__init__.py @@ -0,0 +1,21 @@ +from .basic_arithmetic_curriculum import BasicArithmeticCurriculum +from .calendar_arithmetic_curriculum import CalendarArithmeticCurriculum +from .chain_sum_curriculum import ChainSumCurriculum +from .fraction_simplification_curriculum import FractionSimplificationCurriculum +from .gcd_curriculum import GcdCurriculum +from .lcm_curriculum import LcmCurriculum +from .leg_counting_curriculum import LegCountingCurriculum +from .prime_factorization_curriculum import PrimeFactorizationCurriculum +from .time_intervals_curriculum import TimeIntervalsCurriculum + +__all__ = [ + "BasicArithmeticCurriculum", + "CalendarArithmeticCurriculum", + "ChainSumCurriculum", + "FractionSimplificationCurriculum", + "GcdCurriculum", + "LcmCurriculum", + "LegCountingCurriculum", + "PrimeFactorizationCurriculum", + "TimeIntervalsCurriculum", +] \ No newline at end of file diff --git a/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py b/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py index 69809af5..e9e66dbc 100644 --- a/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py +++ b/reasoning_gym/curricula/arithmetic/chain_sum_curriculum.py @@ -12,14 +12,14 @@ class ChainSumCurriculum(BaseCurriculum): def _init_curriculum(self) -> None: """Initialize the ChainSum curriculum configuration""" - + # Define valid attribute types self._valid_types = { AttributeType.STATIC, # For base numbers AttributeType.UBOUND, # For ranges like digits and terms AttributeType.APPEND # For operators and notations } - + # Define attributes self._attributes = { "num_digits": AttributeDefinition( @@ -66,7 +66,7 @@ class ChainSumCurriculum(BaseCurriculum): attr_type=AttributeType.STATIC ) } - + # Define templates self._templates = [ Template( @@ -91,10 +91,10 @@ def generate_expression(attributes: Dict[str, Any]) -> Dict[str, str]: """ Generates an expression and its result based on current attribute levels. 
This is a placeholder - actual implementation will be in the Exercise class. - + Args: attributes: Dictionary of current attribute levels - + Returns: Dict containing the expression and result as strings """ diff --git a/reasoning_gym/exercises/__init__.py b/reasoning_gym/exercises/__init__.py new file mode 100644 index 00000000..31b5b493 --- /dev/null +++ b/reasoning_gym/exercises/__init__.py @@ -0,0 +1,17 @@ +from .algebra import * +from .algorithmic import * +from .arithmetic import * +from .code import * +from .cognition import * +from .games import * +from .geometry import * +from .graphs import * +from .logic import * + +# Re-export all Dataset classes +__all__ = [] +for module in [ + algebra, algorithmic, arithmetic, code, + cognition, games, geometry, graphs, logic +]: + __all__.extend([name for name in module.__all__ if name.endswith('Dataset')]) \ No newline at end of file diff --git a/reasoning_gym/exercises/arithmetic/__init__.py b/reasoning_gym/exercises/arithmetic/__init__.py index 9e1a5bc2..02d3c28d 100644 --- a/reasoning_gym/exercises/arithmetic/__init__.py +++ b/reasoning_gym/exercises/arithmetic/__init__.py @@ -7,36 +7,24 @@ Arithmetic tasks for training reasoning capabilities: - Time intervals """ -from .basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig -from .calendar_arithmetic import CalendarArithmeticConfig, CalendarArithmeticDataset -from .chain_sum import ChainSum, ChainSumConfig -from .fraction_simplification import FractionSimplificationConfig, FractionSimplificationDataset -from .gcd import GCDConfig, GCDDataset -from .lcm import LCMConfig, LCMDataset -from .leg_counting import LegCountingConfig, LegCountingDataset -from .prime_factorization import PrimeFactorizationConfig, PrimeFactorizationDataset -from .time_intervals import TimeIntervalsConfig, TimeIntervalsDataset +from .basic_arithmetic import BasicArithmeticDataset +from .calendar_arithmetic import CalendarArithmeticDataset +from .chain_sum import 
ChainSumDataset +from .fraction_simplification import FractionSimplificationDataset +from .gcd import GcdDataset +from .lcm import LcmDataset +from .leg_counting import LegCountingDataset +from .prime_factorization import PrimeFactorizationDataset +from .time_intervals import TimeIntervalsDataset __all__ = [ "BasicArithmeticDataset", - "BasicArithmeticDatasetConfig", - "basic_arithmetic_dataset", - "ChainSum", - "ChainSumConfig", - "CalendarArithmeticConfig", "CalendarArithmeticDataset", - "Weekday", - "CalendarTask", - "FractionSimplificationConfig", + "ChainSumDataset", "FractionSimplificationDataset", - "GCDConfig", - "GCDDataset", - "LCMConfig", - "LCMDataset", - "LegCountingConfig", + "GcdDataset", + "LcmDataset", "LegCountingDataset", - "PrimeFactorizationConfig", "PrimeFactorizationDataset", - "TimeIntervalsConfig", "TimeIntervalsDataset", ] diff --git a/reasoning_gym/principal.py b/reasoning_gym/principal.py index 727b7918..be784d67 100644 --- a/reasoning_gym/principal.py +++ b/reasoning_gym/principal.py @@ -4,6 +4,8 @@ import logging from collections import defaultdict import numpy as np from reasoning_gym.core.attribute_monitor import AttributeMonitor +from reasoning_gym.core.exercise_registrar import ExerciseRegistrar +from reasoning_gym.core.attribute_monitor import PerformanceTrend class CurriculumMode(Enum): """Mode of curriculum operation for the Principal.""" @@ -12,76 +14,75 @@ class CurriculumMode(Enum): class Principal: """Manages exercise difficulty and curriculum progression.""" - + def __init__(self, mode: CurriculumMode = CurriculumMode.DYNAMIC): self.exercises = {} # type: Dict[str, Any] # Exercise instances self.exercise_curricula = {} # type: Dict[str, Any] # Loaded curricula self.current_levels = defaultdict(dict) # Current difficulty levels self.performance_monitors = defaultdict(dict) # Attribute monitors self.curriculum_mode = mode - self.plateau_threshold = 0.8 self.logger = logging.getLogger(__name__) - + + # Auto-register exercises 
+ registered = ExerciseRegistrar.register_all() + for exercise_name, (exercise, curriculum) in registered.items(): + self.register_exercise(exercise_name, exercise, curriculum) + def register_exercise(self, exercise_name: str, exercise_instance: Any, curriculum: Any) -> None: """Register a new exercise with its curriculum.""" self.exercises[exercise_name] = exercise_instance self.exercise_curricula[exercise_name] = curriculum - + # Initialize monitors for each attribute for attr_name, attr_def in curriculum.attributes.items(): monitor = AttributeMonitor() monitor.initialize(curriculum, attr_name) self.performance_monitors[exercise_name][attr_name] = monitor - - self.logger.info(f"Registered exercise: {exercise_name}") - + + self.logger.info(f"Principal: Registered exercise: {exercise_name} with {len(curriculum.attributes)} attributes") + def generate_problem(self, exercise_name: str) -> tuple: """Generate a problem from the specified exercise.""" if exercise_name not in self.exercises: - raise KeyError(f"Exercise {exercise_name} not registered") - + raise KeyError(f"Principal: Exercise {exercise_name} not registered") + exercise = self.exercises[exercise_name] - - # Set current attribute levels before generation - for attr_name, monitor in self.performance_monitors[exercise_name].items(): - exercise.set_attribute_level(attr_name, monitor.current_level) - - return exercise.generate() - + problem = exercise.generate() + return problem + # TODO:Implement predefined def update_performance(self, exercise_name: str, attribute_name: str, score: float) -> None: """Update performance metrics for an attribute.""" - + monitor = self.performance_monitors[exercise_name][attribute_name] - monitor.add_score(score) - + atrr_trend =monitor.add_score(score) + if self.curriculum_mode == CurriculumMode.DYNAMIC: - self._adjust_difficulty(exercise_name, attribute_name) - + self._adjust_difficulty(exercise_name, attribute_name, trend=atrr_trend) + # TODO: Implement representation - def 
_adjust_difficulty(self, exercise_name: str, attribute_name: str) -> None: + def _adjust_difficulty(self, exercise_name: str, attribute_name: str, trend: Optional[PerformanceTrend] = None) -> None: """Adjust difficulty based on performance metrics.""" monitor = self.performance_monitors[exercise_name][attribute_name] - - # Implementation of the adjustment logic - if monitor.is_improving(): - # Keep current level while improving - return - elif monitor.is_plateau(): - current_acc = monitor.get_current_accuracy() - # Try to increase difficulty if accuracy is high - if current_acc > self.plateau_threshold: + + # TODO: If plateau and < threshold or degrading, increase representation, if persists n steps, decrease difficulty + match trend: + case PerformanceTrend.IMPROVING: + # Keep current level while improving + return + case PerformanceTrend.PLATEAU_HIGH_ACC: + # Try to increase difficulty if accuracy is high if monitor.increment_level(): self.logger.info( - f"Increasing difficulty for {exercise_name}.{attribute_name} " + f"Principal: Increasing difficulty for {exercise_name}.{attribute_name} " f"to level {monitor.current_level}" ) - elif monitor.is_degrading(): - # If performance is degrading, decrease difficulty - if monitor.decrement_level(): - self.logger.info( - f"Decreasing difficulty for {exercise_name}.{attribute_name} " - f"to level {monitor.current_level}" - ) \ No newline at end of file + case PerformanceTrend.DEGRADING: + # If performance is degrading, decrease difficulty + if monitor.decrement_level(): + self.logger.info( + f"Principal: Decreasing difficulty for {exercise_name}.{attribute_name} " + f"to level {monitor.current_level}" + ) \ No newline at end of file