From 878f9bbc7683012c0e73e528a3cd1b795e0a2cef Mon Sep 17 00:00:00 2001 From: Andreas Koepf Date: Tue, 25 Feb 2025 16:24:30 +0100 Subject: [PATCH] move r1 configs into r1 yaml/r1 subfolder --- eval/.gitignore | 1 - eval/eval.py | 2 +- eval/yaml/{ => r1}/algebra.yaml | 2 +- eval/yaml/{ => r1}/algorithmic.yaml | 2 +- eval/yaml/{ => r1}/cognition.yaml | 2 +- eval/yaml/{ => r1}/logic.yaml | 2 +- eval/yaml/{ => r1}/test.yaml | 2 +- 7 files changed, 6 insertions(+), 7 deletions(-) mode change 100644 => 100755 eval/eval.py rename eval/yaml/{ => r1}/algebra.yaml (92%) rename eval/yaml/{ => r1}/algorithmic.yaml (95%) rename eval/yaml/{ => r1}/cognition.yaml (90%) rename eval/yaml/{ => r1}/logic.yaml (89%) rename eval/yaml/{ => r1}/test.yaml (86%) diff --git a/eval/.gitignore b/eval/.gitignore index 7db6c7ce..484ab7e5 100644 --- a/eval/.gitignore +++ b/eval/.gitignore @@ -1,2 +1 @@ results/* -!results/summary* diff --git a/eval/eval.py b/eval/eval.py old mode 100644 new mode 100755 index 4eb1e12d..bec68e9c --- a/eval/eval.py +++ b/eval/eval.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import argparse import asyncio import json @@ -112,7 +113,6 @@ class OpenRouterEvaluator: async def evaluate_datasets(self) -> list[dict[str, Any]]: """Main async evaluation entry point.""" - all_results = [] async with aiohttp.ClientSession(headers=self.headers) as session: return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets)) diff --git a/eval/yaml/algebra.yaml b/eval/yaml/r1/algebra.yaml similarity index 92% rename from eval/yaml/algebra.yaml rename to eval/yaml/r1/algebra.yaml index b95c1e31..cdad5982 100644 --- a/eval/yaml/algebra.yaml +++ b/eval/yaml/r1/algebra.yaml @@ -7,7 +7,7 @@ datasets: - simple_equations - simple_integration - complex_arithmetic -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 42 developer_role: system diff --git a/eval/yaml/algorithmic.yaml b/eval/yaml/r1/algorithmic.yaml similarity index 95% rename from eval/yaml/algorithmic.yaml rename to eval/yaml/r1/algorithmic.yaml index 5d0d630a..37031b9f 100644 --- a/eval/yaml/algorithmic.yaml +++ b/eval/yaml/r1/algorithmic.yaml @@ -18,7 +18,7 @@ datasets: - word_ladder - word_sequence_reversal - word_sorting -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 42 developer_role: system diff --git a/eval/yaml/cognition.yaml b/eval/yaml/r1/cognition.yaml similarity index 90% rename from eval/yaml/cognition.yaml rename to eval/yaml/r1/cognition.yaml index 911a92e5..c15d7087 100644 --- a/eval/yaml/cognition.yaml +++ b/eval/yaml/r1/cognition.yaml @@ -5,7 +5,7 @@ datasets: - figlet_font - number_sequence - rubiks_cube -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 42 developer_role: system diff --git a/eval/yaml/logic.yaml b/eval/yaml/r1/logic.yaml similarity index 89% rename from eval/yaml/logic.yaml rename to eval/yaml/r1/logic.yaml index 400c4ff3..57cd05d6 100644 --- a/eval/yaml/logic.yaml +++ b/eval/yaml/r1/logic.yaml @@ -5,7 +5,7 @@ datasets: - self_reference - syllogism - zebra_puzzles -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 42 developer_role: system diff --git a/eval/yaml/test.yaml b/eval/yaml/r1/test.yaml similarity index 86% rename from eval/yaml/test.yaml rename to eval/yaml/r1/test.yaml index b6956a2a..1660fd82 100644 --- a/eval/yaml/test.yaml +++ b/eval/yaml/r1/test.yaml @@ -2,7 +2,7 @@ model: deepseek/deepseek-r1 category: test datasets: - YOUR_DATASET_NAME -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 10 dataset_seed: 42 developer_role: system