mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-26 17:13:17 +00:00
Add Coaching & ScoreBoard class (result tracking) (#72)
* feat: Add Coach and ScoreBoard classes for performance tracking and difficulty adjustment
* feat: Add GroupedScores class to wrap aggregated scores
* refactor: Create ScoreStats class with tuple-based score statistics
* feat: Add unit test for Coach with CompositeDataset and multiple datasets
* fix: Add difficulty metadata to leg counting dataset
* feat: Add clear() method to ScoreBoard to reset all stored data
* feat: Add __len__ method to ScoreBoard to return number of scores
* feat: Add update_dataset_config method to CompositeDataset
* cleanup __init__ & imports
This commit is contained in:
parent
05e2681ada
commit
a607db79f7
18 changed files with 549 additions and 39 deletions
|
|
@ -1,10 +1,5 @@
|
|||
"""
|
||||
Arithmetic tasks for training reasoning capabilities:
|
||||
- Basic arithmetic
|
||||
- Chain sums
|
||||
- Word problems
|
||||
- Leg counting
|
||||
- Time intervals
|
||||
"""
|
||||
|
||||
from .basic_arithmetic import BasicArithmeticDataset, BasicArithmeticDatasetConfig
|
||||
|
|
@ -21,13 +16,10 @@ from .time_intervals import TimeIntervalsConfig, TimeIntervalsDataset
|
|||
__all__ = [
|
||||
"BasicArithmeticDataset",
|
||||
"BasicArithmeticDatasetConfig",
|
||||
"basic_arithmetic_dataset",
|
||||
"ChainSum",
|
||||
"ChainSumConfig",
|
||||
"CalendarArithmeticConfig",
|
||||
"CalendarArithmeticDataset",
|
||||
"Weekday",
|
||||
"CalendarTask",
|
||||
"FractionSimplificationConfig",
|
||||
"FractionSimplificationDataset",
|
||||
"GCDConfig",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import math
|
|||
import random
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, timedelta
|
||||
from enum import Enum, auto
|
||||
from enum import Enum, StrEnum, auto
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from ..factory import ProceduralDataset, register_dataset
|
||||
|
|
@ -38,7 +38,7 @@ class Weekday(Enum):
|
|||
return self.name.capitalize()
|
||||
|
||||
|
||||
class CalendarTask(Enum):
|
||||
class CalendarTask(StrEnum):
|
||||
WEEKDAY_OFFSET = "weekday_offset"
|
||||
WEEKDAY_OF_DATE = "weekday_of_date"
|
||||
WEEKDAY_OF_DATE_FROM_FIRST_DATE = "weekday_of_date_from_first_day"
|
||||
|
|
|
|||
|
|
@ -65,8 +65,10 @@ class ChainSum(ProceduralDataset):
|
|||
"question": f"{expression} =",
|
||||
"answer": str(result),
|
||||
"metadata": {
|
||||
"num_terms": num_terms,
|
||||
"num_digits": num_digits,
|
||||
"difficulty": {
|
||||
"num_terms": num_terms,
|
||||
"num_digits": num_digits,
|
||||
},
|
||||
"expression": expression,
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -111,7 +111,13 @@ class LegCountingDataset(ProceduralDataset):
|
|||
return {
|
||||
"question": question,
|
||||
"answer": str(total_legs),
|
||||
"metadata": {"animals": animals, "total_legs": total_legs},
|
||||
"metadata": {
|
||||
"difficulty": {
|
||||
"num_animals": len(animals),
|
||||
},
|
||||
"animals": animals,
|
||||
"total_legs": total_legs,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
from dataclasses import dataclass
|
||||
from random import Random
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import List, Optional
|
||||
|
||||
from ..factory import ProceduralDataset, register_dataset
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue