mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-28 17:29:39 +00:00
feat: Add --category option to evaluate datasets from a specific category
This commit is contained in:
parent
1d813c9acd
commit
d29a665081
1 changed file with 20 additions and 2 deletions
22
eval/eval.py
22
eval/eval.py
|
|
@@ -10,6 +10,7 @@ Usage:
|
|||
Options:
|
||||
--model MODEL Override model specified in config
|
||||
--output-dir DIR Override output directory specified in config
|
||||
--category CATEGORY Evaluate only datasets from this category
|
||||
--max-concurrent NUM Maximum number of concurrent API calls
|
||||
--base-url URL API base URL (default: https://openrouter.ai/api/v1)
|
||||
--save-metadata Save entry metadata in results
|
||||
|
|
@@ -390,8 +391,15 @@ class AsyncModelEvaluator:
|
|||
timestamp = self.start_time.strftime("%Y%m%d_%H%M%S")
|
||||
model_name = self.config.model.replace("/", "_")
|
||||
|
||||
# Format directory name with model and timestamp only
|
||||
output_dir = Path(self.config.output_dir) / f"{model_name}_{timestamp}"
|
||||
# Format directory name with model, category (if single category), and timestamp
|
||||
if len(self.config.categories) == 1:
|
||||
# Include category name in the output directory when evaluating a single category
|
||||
category_name = self.config.categories[0].category
|
||||
output_dir = Path(self.config.output_dir) / f"{model_name}_{category_name}_{timestamp}"
|
||||
else:
|
||||
# Original format for multiple categories
|
||||
output_dir = Path(self.config.output_dir) / f"{model_name}_{timestamp}"
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
results_path = None
|
||||
|
|
@@ -477,6 +485,7 @@ async def main_async():
|
|||
parser.add_argument("--config", required=True, help="Path to configuration file (YAML or JSON)")
|
||||
parser.add_argument("--model", help="Override model specified in config")
|
||||
parser.add_argument("--output-dir", help="Override output directory specified in config")
|
||||
parser.add_argument("--category", help="Evaluate only datasets from this category")
|
||||
parser.add_argument("--max-concurrent", type=int, help="Maximum number of concurrent API calls")
|
||||
parser.add_argument("--base-url", default="https://openrouter.ai/api/v1", help="API base URL")
|
||||
parser.add_argument(
|
||||
|
|
@@ -523,6 +532,15 @@ async def main_async():
|
|||
config.save_metadata = True
|
||||
if args.full_results:
|
||||
config.save_full_results = True
|
||||
|
||||
# Filter categories if --category is specified
|
||||
if args.category:
|
||||
# Keep only the specified category
|
||||
filtered_categories = [cat for cat in config.categories if cat.category == args.category]
|
||||
if not filtered_categories:
|
||||
print(f"Error: Category '{args.category}' not found in configuration")
|
||||
return 1
|
||||
config.categories = filtered_categories
|
||||
|
||||
# Create evaluator
|
||||
evaluator = AsyncModelEvaluator(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue