changed structure

2026-04-24 17:05:03 +00:00 · 2025-02-25 16:32:42 +00:00 · 2025-02-25 16:32:42 +00:00 · 68e8ea89d8
commit 68e8ea89d8
parent ce8877167d
6 changed files with 45 additions and 2 deletions
--- a/eval/README.md
+++ b/eval/README.md
@ -34,7 +34,7 @@ export OPENROUTER_API_KEY=your-api-key
 ```
-4. Prepare your dataset configuration in YAML format (see examples in `yaml/algorithmic.yaml` or `yaml/logic.yaml`):
+4. Prepare your dataset configuration in YAML format (see examples in `yaml/<model_name>/algorithmic.yaml`, e.g. `yaml/r1/algorithmic.yaml`):
 ```yaml
 model: model-name
 category: category-name
@ -130,8 +130,9 @@ python eval.py --yaml <path-to yaml file>
 ```
 e.g.
 ```
-python eval.py --yaml yaml/algorithmic.yaml
+python eval.py --yaml yaml/r1/algorithmic.yaml
 ```
 To run r1 evaluations on algorithmic.yaml
 The results of each model on a dataset will be stored in a new folder in the directory, e.g. `r1/algorithmic/proposition_logic.json`
--- a/eval/yaml/anthropic/algorithmic.yaml
+++ b/eval/yaml/anthropic/algorithmic.yaml
@ -0,0 +1,31 @@
 model: anthropic/claude-3.5-sonnet  # evaluation config: Claude 3.5 Sonnet, algorithmic category
 category: algorithmic
 provider: Anthropic
 datasets:  # datasets evaluated for this category
  - count_primes
  - game_of_life
  - graph_color
  - group_anagrams
  - isomorphic_strings
  - letter_counting
  - letter_jumble
  - manipulate_matrix
  - number_filtering
  - number_sorting
  - palindrome
  - pool_matrix
  - ransom_note
  - rotate_matrix
  - sentence_reordering
  - spell_backward
  - spiral_matrix
  - string_insertion
  - string_manipulation
  - string_synthesis
  - word_ladder
  - word_sequence_reversal
  - word_sorting
 eval_dir: eval/r1  # NOTE(review): r1 path in an anthropic config; confirm results won't mix with r1 runs
 dataset_size: 50  # presumably the number of examples per dataset; TODO confirm
 dataset_seed: 45  # NOTE(review): the r1 logic config uses seed 42; confirm 45 is intended
 developer_role: system
--- a/eval/yaml/r1/algebra.yaml
+++ b/eval/yaml/r1/algebra.yaml
--- a/eval/yaml/r1/algorithmic.yaml
+++ b/eval/yaml/r1/algorithmic.yaml
--- a/eval/yaml/r1/cognition.yaml
+++ b/eval/yaml/r1/cognition.yaml
--- a/eval/yaml/r1/logic.yaml
+++ b/eval/yaml/r1/logic.yaml
@ -0,0 +1,11 @@
 model: deepseek/deepseek-r1  # evaluation config: DeepSeek R1, logic category
 category: logic
 datasets:  # datasets evaluated for this category
  - propositional_logic
  - self_reference
  - syllogism
  - zebra_puzzles
 eval_dir: eval/r1  # presumably the directory results are written under; confirm against eval.py
 dataset_size: 50  # presumably the number of examples per dataset; TODO confirm
 dataset_seed: 42  # presumably the seed used when generating each dataset; TODO confirm
 developer_role: system  # NOTE(review): presumably the chat role used for the instruction prompt; confirm