diff --git a/eval/README.md b/eval/README.md index c5dd3ce3..8146a8f7 100644 --- a/eval/README.md +++ b/eval/README.md @@ -34,7 +34,7 @@ export OPENROUTER_API_KEY=your-api-key ``` -4. Prepare your dataset configuration in YAML format (see examples in `yaml/algorithmic.yaml` or `yaml/logic.yaml`): +4. Prepare your dataset configuration in YAML format (see examples in `yaml/<model>/algorithmic.yaml`, e.g. `yaml/r1/algorithmic.yaml`): ```yaml model: model-name category: category-name @@ -130,8 +130,9 @@ python eval.py --yaml ``` e.g ``` -python eval.py --yaml yaml/algorithmic.yaml +python eval.py --yaml yaml/r1/algorithmic.yaml ``` +The command above runs the r1 evaluations configured in `algorithmic.yaml`. The results of individual model on a dataset will be stored in a new folder in the directory E.g `r1/algorithmic/proposition_logic.json` diff --git a/eval/yaml/anthropic/algorithmic.yaml b/eval/yaml/anthropic/algorithmic.yaml new file mode 100644 index 00000000..bd2e1ccd --- /dev/null +++ b/eval/yaml/anthropic/algorithmic.yaml @@ -0,0 +1,31 @@ +model: anthropic/claude-3.5-sonnet +category: algorithmic +provider: Anthropic +datasets: + - count_primes + - game_of_life + - graph_color + - group_anagrams + - isomorphic_strings + - letter_counting + - letter_jumble + - manipulate_matrix + - number_filtering + - number_sorting + - palindrome + - pool_matrix + - ransom_note + - rotate_matrix + - sentence_reordering + - spell_backward + - spiral_matrix + - string_insertion + - string_manipulation + - string_synthesis + - word_ladder + - word_sequence_reversal + - word_sorting +eval_dir: eval/r1 +dataset_size: 50 +dataset_seed: 45 +developer_role: system diff --git a/eval/yaml/algebra.yaml b/eval/yaml/r1/algebra.yaml similarity index 100% rename from eval/yaml/algebra.yaml rename to eval/yaml/r1/algebra.yaml diff --git a/eval/yaml/algorithmic.yaml b/eval/yaml/r1/algorithmic.yaml similarity index 100% rename from eval/yaml/algorithmic.yaml rename to eval/yaml/r1/algorithmic.yaml diff --git a/eval/yaml/cognition.yaml 
b/eval/yaml/r1/cognition.yaml similarity index 100% rename from eval/yaml/cognition.yaml rename to eval/yaml/r1/cognition.yaml diff --git a/eval/yaml/r1/logic.yaml b/eval/yaml/r1/logic.yaml new file mode 100644 index 00000000..400c4ff3 --- /dev/null +++ b/eval/yaml/r1/logic.yaml @@ -0,0 +1,11 @@ +model: deepseek/deepseek-r1 +category: logic +datasets: + - propositional_logic + - self_reference + - syllogism + - zebra_puzzles +eval_dir: eval/r1 +dataset_size: 50 +dataset_seed: 42 +developer_role: system