mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
Eval N completions per prompt (#374)
* feat: Add support for generating multiple completions per prompt
* feat: Track best and mean scores for multiple completions per prompt
* feat: Add checkpoint and resume functionality to evaluation script
This commit is contained in:
parent
bd13b1b92a
commit
bfa5f8078b
12 changed files with 426 additions and 126 deletions
|
|
@ -89,6 +89,7 @@ categories:
|
|||
- dataset: rubiks_cube
|
||||
- category: games
|
||||
datasets:
|
||||
- dataset: boxnet
|
||||
- dataset: countdown
|
||||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ categories:
|
|||
- dataset: rubiks_cube
|
||||
- category: games
|
||||
datasets:
|
||||
- dataset: boxnet
|
||||
- dataset: countdown
|
||||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
|
|
|
|||
|
|
@ -1,15 +0,0 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
output_dir: results/deepseek-r1_algebra
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
categories:
|
||||
- category: algebra
|
||||
datasets:
|
||||
- dataset: complex_arithmetic
|
||||
- dataset: intermediate_integration
|
||||
- dataset: polynomial_equations
|
||||
- dataset: polynomial_multiplication
|
||||
- dataset: simple_equations
|
||||
- dataset: simple_integration
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
output_dir: results/deepseek-r1_arc
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
categories:
|
||||
- category: arc
|
||||
datasets:
|
||||
- dataset: arc_1d
|
||||
- dataset: arc_agi
|
||||
- dataset: rearc
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
model: deepseek/deepseek-r1
|
||||
provider: Nebius
|
||||
output_dir: results/deepseek-r1_logic
|
||||
max_concurrent: 10
|
||||
default_size: 50
|
||||
default_seed: 45
|
||||
categories:
|
||||
- category: logic
|
||||
datasets:
|
||||
- dataset: aiw
|
||||
- dataset: circuit_logic
|
||||
- dataset: knights_knaves
|
||||
- dataset: propositional_logic
|
||||
- dataset: self_reference
|
||||
- dataset: syllogism
|
||||
- dataset: zebra_puzzles
|
||||
|
|
@ -89,6 +89,7 @@ categories:
|
|||
- dataset: rubiks_cube
|
||||
- category: games
|
||||
datasets:
|
||||
- dataset: boxnet
|
||||
- dataset: countdown
|
||||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ categories:
|
|||
- dataset: rubiks_cube
|
||||
- category: games
|
||||
datasets:
|
||||
- dataset: boxnet
|
||||
- dataset: countdown
|
||||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
|
|
|
|||
|
|
@ -1,9 +0,0 @@
|
|||
model: openai/o1
|
||||
category: games
|
||||
provider: OpenAI
|
||||
datasets:
|
||||
- emoji_mystery
|
||||
eval_dir: eval/openai-01
|
||||
dataset_size: 50
|
||||
dataset_seed: 45
|
||||
developer_role: system
|
||||
|
|
@ -89,6 +89,7 @@ categories:
|
|||
- dataset: rubiks_cube
|
||||
- category: games
|
||||
datasets:
|
||||
- dataset: boxnet
|
||||
- dataset: countdown
|
||||
- dataset: emoji_mystery
|
||||
- dataset: futoshiki
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue