move r1 configs into yaml/r1 subfolder

This commit is contained in:
Andreas Koepf 2025-02-25 16:24:30 +01:00
parent e7ae82a831
commit 878f9bbc76
7 changed files with 6 additions and 7 deletions

1
eval/.gitignore vendored
View file

@ -1,2 +1 @@
results/*
!results/summary*

2
eval/eval.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python
import argparse
import asyncio
import json
@ -112,7 +113,6 @@ class OpenRouterEvaluator:
async def evaluate_datasets(self) -> list[dict[str, Any]]:
"""Main async evaluation entry point."""
all_results = []
async with aiohttp.ClientSession(headers=self.headers) as session:
return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))

View file

@ -7,7 +7,7 @@ datasets:
- simple_equations
- simple_integration
- complex_arithmetic
eval_dir: eval/r1
eval_dir: results/r1
dataset_size: 50
dataset_seed: 42
developer_role: system

View file

@ -18,7 +18,7 @@ datasets:
- word_ladder
- word_sequence_reversal
- word_sorting
eval_dir: eval/r1
eval_dir: results/r1
dataset_size: 50
dataset_seed: 42
developer_role: system

View file

@ -5,7 +5,7 @@ datasets:
- figlet_font
- number_sequence
- rubiks_cube
eval_dir: eval/r1
eval_dir: results/r1
dataset_size: 50
dataset_seed: 42
developer_role: system

View file

@ -5,7 +5,7 @@ datasets:
- self_reference
- syllogism
- zebra_puzzles
eval_dir: eval/r1
eval_dir: results/r1
dataset_size: 50
dataset_seed: 42
developer_role: system

View file

@ -2,7 +2,7 @@ model: deepseek/deepseek-r1
category: test
datasets:
- YOUR_DATASET_NAME
eval_dir: eval/r1
eval_dir: results/r1
dataset_size: 10
dataset_seed: 42
developer_role: system