Move r1 configs into the yaml/r1 subfolder

This commit is contained in:
Andreas Koepf 2025-02-25 16:24:30 +01:00
parent e7ae82a831
commit 878f9bbc76
7 changed files with 6 additions and 7 deletions

1
eval/.gitignore vendored
View file

@ -1,2 +1 @@
results/* results/*
!results/summary*

2
eval/eval.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python
import argparse import argparse
import asyncio import asyncio
import json import json
@ -112,7 +113,6 @@ class OpenRouterEvaluator:
async def evaluate_datasets(self) -> list[dict[str, Any]]: async def evaluate_datasets(self) -> list[dict[str, Any]]:
"""Main async evaluation entry point.""" """Main async evaluation entry point."""
all_results = []
async with aiohttp.ClientSession(headers=self.headers) as session: async with aiohttp.ClientSession(headers=self.headers) as session:
return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets)) return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))

View file

@ -7,7 +7,7 @@ datasets:
- simple_equations - simple_equations
- simple_integration - simple_integration
- complex_arithmetic - complex_arithmetic
eval_dir: eval/r1 eval_dir: results/r1
dataset_size: 50 dataset_size: 50
dataset_seed: 42 dataset_seed: 42
developer_role: system developer_role: system

View file

@ -18,7 +18,7 @@ datasets:
- word_ladder - word_ladder
- word_sequence_reversal - word_sequence_reversal
- word_sorting - word_sorting
eval_dir: eval/r1 eval_dir: results/r1
dataset_size: 50 dataset_size: 50
dataset_seed: 42 dataset_seed: 42
developer_role: system developer_role: system

View file

@ -5,7 +5,7 @@ datasets:
- figlet_font - figlet_font
- number_sequence - number_sequence
- rubiks_cube - rubiks_cube
eval_dir: eval/r1 eval_dir: results/r1
dataset_size: 50 dataset_size: 50
dataset_seed: 42 dataset_seed: 42
developer_role: system developer_role: system

View file

@ -5,7 +5,7 @@ datasets:
- self_reference - self_reference
- syllogism - syllogism
- zebra_puzzles - zebra_puzzles
eval_dir: eval/r1 eval_dir: results/r1
dataset_size: 50 dataset_size: 50
dataset_seed: 42 dataset_seed: 42
developer_role: system developer_role: system

View file

@ -2,7 +2,7 @@ model: deepseek/deepseek-r1
category: test category: test
datasets: datasets:
- YOUR_DATASET_NAME - YOUR_DATASET_NAME
eval_dir: eval/r1 eval_dir: results/r1
dataset_size: 10 dataset_size: 10
dataset_seed: 42 dataset_seed: 42
developer_role: system developer_role: system