mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
move r1 configs into r1 yaml/r1 subfolder
This commit is contained in:
parent
e7ae82a831
commit
878f9bbc76
7 changed files with 6 additions and 7 deletions
1
eval/.gitignore
vendored
1
eval/.gitignore
vendored
|
|
@ -1,2 +1 @@
|
|||
results/*
|
||||
!results/summary*
|
||||
|
|
|
|||
2
eval/eval.py
Normal file → Executable file
2
eval/eval.py
Normal file → Executable file
|
|
@ -1,3 +1,4 @@
|
|||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
|
|
@ -112,7 +113,6 @@ class OpenRouterEvaluator:
|
|||
|
||||
async def evaluate_datasets(self) -> list[dict[str, Any]]:
|
||||
"""Main async evaluation entry point."""
|
||||
all_results = []
|
||||
async with aiohttp.ClientSession(headers=self.headers) as session:
|
||||
return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ datasets:
|
|||
- simple_equations
|
||||
- simple_integration
|
||||
- complex_arithmetic
|
||||
eval_dir: eval/r1
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
|
|
@ -18,7 +18,7 @@ datasets:
|
|||
- word_ladder
|
||||
- word_sequence_reversal
|
||||
- word_sorting
|
||||
eval_dir: eval/r1
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
|
|
@ -5,7 +5,7 @@ datasets:
|
|||
- figlet_font
|
||||
- number_sequence
|
||||
- rubiks_cube
|
||||
eval_dir: eval/r1
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
|
|
@ -5,7 +5,7 @@ datasets:
|
|||
- self_reference
|
||||
- syllogism
|
||||
- zebra_puzzles
|
||||
eval_dir: eval/r1
|
||||
eval_dir: results/r1
|
||||
dataset_size: 50
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
|
|
@ -2,7 +2,7 @@ model: deepseek/deepseek-r1
|
|||
category: test
|
||||
datasets:
|
||||
- YOUR_DATASET_NAME
|
||||
eval_dir: eval/r1
|
||||
eval_dir: results/r1
|
||||
dataset_size: 10
|
||||
dataset_seed: 42
|
||||
developer_role: system
|
||||
Loading…
Add table
Add a link
Reference in a new issue