mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-24 17:05:03 +00:00
move r1 configs into r1 yaml/r1 subfolder
This commit is contained in:
parent
e7ae82a831
commit
878f9bbc76
7 changed files with 6 additions and 7 deletions
1
eval/.gitignore
vendored
1
eval/.gitignore
vendored
|
|
@ -1,2 +1 @@
|
||||||
results/*
|
results/*
|
||||||
!results/summary*
|
|
||||||
|
|
|
||||||
2
eval/eval.py
Normal file → Executable file
2
eval/eval.py
Normal file → Executable file
|
|
@ -1,3 +1,4 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
|
@ -112,7 +113,6 @@ class OpenRouterEvaluator:
|
||||||
|
|
||||||
async def evaluate_datasets(self) -> list[dict[str, Any]]:
|
async def evaluate_datasets(self) -> list[dict[str, Any]]:
|
||||||
"""Main async evaluation entry point."""
|
"""Main async evaluation entry point."""
|
||||||
all_results = []
|
|
||||||
async with aiohttp.ClientSession(headers=self.headers) as session:
|
async with aiohttp.ClientSession(headers=self.headers) as session:
|
||||||
return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
|
return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ datasets:
|
||||||
- simple_equations
|
- simple_equations
|
||||||
- simple_integration
|
- simple_integration
|
||||||
- complex_arithmetic
|
- complex_arithmetic
|
||||||
eval_dir: eval/r1
|
eval_dir: results/r1
|
||||||
dataset_size: 50
|
dataset_size: 50
|
||||||
dataset_seed: 42
|
dataset_seed: 42
|
||||||
developer_role: system
|
developer_role: system
|
||||||
|
|
@ -18,7 +18,7 @@ datasets:
|
||||||
- word_ladder
|
- word_ladder
|
||||||
- word_sequence_reversal
|
- word_sequence_reversal
|
||||||
- word_sorting
|
- word_sorting
|
||||||
eval_dir: eval/r1
|
eval_dir: results/r1
|
||||||
dataset_size: 50
|
dataset_size: 50
|
||||||
dataset_seed: 42
|
dataset_seed: 42
|
||||||
developer_role: system
|
developer_role: system
|
||||||
|
|
@ -5,7 +5,7 @@ datasets:
|
||||||
- figlet_font
|
- figlet_font
|
||||||
- number_sequence
|
- number_sequence
|
||||||
- rubiks_cube
|
- rubiks_cube
|
||||||
eval_dir: eval/r1
|
eval_dir: results/r1
|
||||||
dataset_size: 50
|
dataset_size: 50
|
||||||
dataset_seed: 42
|
dataset_seed: 42
|
||||||
developer_role: system
|
developer_role: system
|
||||||
|
|
@ -5,7 +5,7 @@ datasets:
|
||||||
- self_reference
|
- self_reference
|
||||||
- syllogism
|
- syllogism
|
||||||
- zebra_puzzles
|
- zebra_puzzles
|
||||||
eval_dir: eval/r1
|
eval_dir: results/r1
|
||||||
dataset_size: 50
|
dataset_size: 50
|
||||||
dataset_seed: 42
|
dataset_seed: 42
|
||||||
developer_role: system
|
developer_role: system
|
||||||
|
|
@ -2,7 +2,7 @@ model: deepseek/deepseek-r1
|
||||||
category: test
|
category: test
|
||||||
datasets:
|
datasets:
|
||||||
- YOUR_DATASET_NAME
|
- YOUR_DATASET_NAME
|
||||||
eval_dir: eval/r1
|
eval_dir: results/r1
|
||||||
dataset_size: 10
|
dataset_size: 10
|
||||||
dataset_seed: 42
|
dataset_seed: 42
|
||||||
developer_role: system
|
developer_role: system
|
||||||
Loading…
Add table
Add a link
Reference in a new issue