Eval N completions per prompt (#374)

* feat: Add support for generating multiple completions per prompt
* feat: Track best and mean scores for multiple completions per prompt
* feat: Add checkpoint and resume functionality to evaluation script
This commit is contained in:
Andreas Köpf 2025-03-15 16:39:36 +01:00 committed by GitHub
parent bd13b1b92a
commit bfa5f8078b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 426 additions and 126 deletions

View file

@ -89,6 +89,7 @@ categories:
- dataset: rubiks_cube
- category: games
datasets:
- dataset: boxnet
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki

View file

@ -89,6 +89,7 @@ categories:
- dataset: rubiks_cube
- category: games
datasets:
- dataset: boxnet
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki

View file

@ -1,15 +0,0 @@
# Evaluation config: model deepseek/deepseek-r1 via provider Nebius,
# covering the datasets of the "algebra" category.
# NOTE(review): the nesting below was reconstructed from text whose
# indentation had been flattened; `datasets` is assumed to belong to the
# category entry — confirm against the config loader's schema.
model: deepseek/deepseek-r1
provider: Nebius
output_dir: results/deepseek-r1_algebra
max_concurrent: 10  # presumably the cap on parallel requests — confirm
default_size: 50  # presumably applied to each dataset below — confirm
default_seed: 45
categories:
  - category: algebra
    datasets:
      - dataset: complex_arithmetic
      - dataset: intermediate_integration
      - dataset: polynomial_equations
      - dataset: polynomial_multiplication
      - dataset: simple_equations
      - dataset: simple_integration

View file

@ -1,12 +0,0 @@
# Evaluation config: model deepseek/deepseek-r1 via provider Nebius,
# covering the datasets of the "arc" category.
# NOTE(review): the nesting below was reconstructed from text whose
# indentation had been flattened; `datasets` is assumed to belong to the
# category entry — confirm against the config loader's schema.
model: deepseek/deepseek-r1
provider: Nebius
output_dir: results/deepseek-r1_arc
max_concurrent: 10  # presumably the cap on parallel requests — confirm
default_size: 50  # presumably applied to each dataset below — confirm
default_seed: 45
categories:
  - category: arc
    datasets:
      - dataset: arc_1d
      - dataset: arc_agi
      - dataset: rearc

View file

@ -1,16 +0,0 @@
# Evaluation config: model deepseek/deepseek-r1 via provider Nebius,
# covering the datasets of the "logic" category.
# NOTE(review): the nesting below was reconstructed from text whose
# indentation had been flattened; `datasets` is assumed to belong to the
# category entry — confirm against the config loader's schema.
model: deepseek/deepseek-r1
provider: Nebius
output_dir: results/deepseek-r1_logic
max_concurrent: 10  # presumably the cap on parallel requests — confirm
default_size: 50  # presumably applied to each dataset below — confirm
default_seed: 45
categories:
  - category: logic
    datasets:
      - dataset: aiw
      - dataset: circuit_logic
      - dataset: knights_knaves
      - dataset: propositional_logic
      - dataset: self_reference
      - dataset: syllogism
      - dataset: zebra_puzzles

View file

@ -89,6 +89,7 @@ categories:
- dataset: rubiks_cube
- category: games
datasets:
- dataset: boxnet
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki

View file

@ -89,6 +89,7 @@ categories:
- dataset: rubiks_cube
- category: games
datasets:
- dataset: boxnet
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki

View file

@ -1,9 +0,0 @@
# Single-category evaluation config for openai/o1 (provider: OpenAI).

# Which model to call and how.
model: openai/o1
provider: OpenAI
developer_role: system

# What to evaluate.
category: games
datasets:
  - emoji_mystery
dataset_size: 50
dataset_seed: 45

# Where results go.
# NOTE(review): the directory name uses "01" (zero-one) while the model id is
# "o1" — confirm whether that spelling is intentional.
eval_dir: eval/openai-01

View file

@ -89,6 +89,7 @@ categories:
- dataset: rubiks_cube
- category: games
datasets:
- dataset: boxnet
- dataset: countdown
- dataset: emoji_mystery
- dataset: futoshiki