From 791f16ec0f3d502c9a628445708b575930e81de1 Mon Sep 17 00:00:00 2001 From: Andreas Koepf Date: Tue, 25 Feb 2025 19:41:21 +0100 Subject: [PATCH] use results folder name for eval results --- eval/README.md | 6 +++--- eval/yaml/example.yaml | 8 ++++++++ eval/yaml/r1/algorithmic.yaml | 2 +- eval/yaml/r1/test.yaml | 8 -------- 4 files changed, 12 insertions(+), 12 deletions(-) create mode 100644 eval/yaml/example.yaml delete mode 100644 eval/yaml/r1/test.yaml diff --git a/eval/README.md b/eval/README.md index 72050a41..3bf8d750 100644 --- a/eval/README.md +++ b/eval/README.md @@ -42,7 +42,7 @@ category: category-name datasets: - dataset1 - dataset2 -eval_dir: eval/r1 +eval_dir: results/model-name dataset_size: 50 dataset_seed: 42 developer_role: system @@ -81,7 +81,7 @@ datasets: - word_ladder - word_sequence_reversal - word_sorting -eval_dir: eval/r1 +eval_dir: results/deepseek-r1 dataset_size: 50 dataset_seed: 45 developer_role: system @@ -117,7 +117,7 @@ datasets: - word_ladder - word_sequence_reversal - word_sorting -eval_dir: eval/r1 +eval_dir: results/claude-3.5-sonnet dataset_size: 50 dataset_seed: 45 developer_role: system diff --git a/eval/yaml/example.yaml b/eval/yaml/example.yaml new file mode 100644 index 00000000..44dd4154 --- /dev/null +++ b/eval/yaml/example.yaml @@ -0,0 +1,8 @@ +model: anthropic/claude-3.7-sonnet # find model id: https://openrouter.ai/models +category: test +datasets: + - YOUR_DATASET_NAME +eval_dir: results/test +dataset_size: 100 +dataset_seed: 42 +developer_role: system diff --git a/eval/yaml/r1/algorithmic.yaml b/eval/yaml/r1/algorithmic.yaml index ea7bbd8a..9db9c370 100644 --- a/eval/yaml/r1/algorithmic.yaml +++ b/eval/yaml/r1/algorithmic.yaml @@ -28,7 +28,7 @@ datasets: - word_ladder - word_sequence_reversal - word_sorting -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 45 developer_role: system diff --git a/eval/yaml/r1/test.yaml b/eval/yaml/r1/test.yaml deleted file mode 100644 index 1660fd82..00000000 --- a/eval/yaml/r1/test.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model: deepseek/deepseek-r1 -category: test -datasets: - - YOUR_DATASET_NAME -eval_dir: results/r1 -dataset_size: 10 -dataset_seed: 42 -developer_role: system